LCOV - code coverage report
Current view: top level - gcore - rasterio.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2543 2766 91.9 %
Date: 2025-10-21 22:35:35 Functions: 685 728 94.1 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  Contains default implementation of GDALRasterBand::IRasterIO()
       5             :  *           and supporting functions of broader utility.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 1998, Frank Warmerdam
      10             :  * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal.h"
      17             : #include "gdal_priv.h"
      18             : 
      19             : #include <cassert>
      20             : #include <climits>
      21             : #include <cmath>
      22             : #include <cstddef>
      23             : #include <cstdio>
      24             : #include <cstdlib>
      25             : #include <cstring>
      26             : 
      27             : #include <algorithm>
      28             : #include <limits>
      29             : #include <stdexcept>
      30             : #include <type_traits>
      31             : 
      32             : #include "cpl_conv.h"
      33             : #include "cpl_cpu_features.h"
      34             : #include "cpl_error.h"
      35             : #include "cpl_float.h"
      36             : #include "cpl_progress.h"
      37             : #include "cpl_string.h"
      38             : #include "cpl_vsi.h"
      39             : #include "gdal_priv_templates.hpp"
      40             : #include "gdal_vrt.h"
      41             : #include "gdalwarper.h"
      42             : #include "memdataset.h"
      43             : #include "vrtdataset.h"
      44             : 
      45             : #if defined(__x86_64) || defined(_M_X64)
      46             : #include <emmintrin.h>
      47             : #define HAVE_SSE2
      48             : #elif defined(USE_NEON_OPTIMIZATIONS)
      49             : #include "include_sse2neon.h"
      50             : #define HAVE_SSE2
      51             : #endif
      52             : 
      53             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
      54             : #include "rasterio_ssse3.h"
      55             : #ifdef __SSSE3__
      56             : #include <tmmintrin.h>
      57             : #endif
      58             : #endif
      59             : 
      60             : #ifdef __SSE4_1__
      61             : #include <smmintrin.h>
      62             : #endif
      63             : 
      64             : #ifdef __GNUC__
      65             : #define CPL_NOINLINE __attribute__((noinline))
      66             : #else
      67             : #define CPL_NOINLINE
      68             : #endif
      69             : 
      70             : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
      71             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
      72             :                              int nDstPixelStride, GPtrDiff_t nWordCount);
      73             : 
      74             : /************************************************************************/
      75             : /*                    DownsamplingIntegerXFactor()                      */
      76             : /************************************************************************/
      77             : 
      78             : template <bool bSameDataType, int DATA_TYPE_SIZE>
      79      695780 : static bool DownsamplingIntegerXFactor(
      80             :     GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
      81             :     GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
      82             :     GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
      83             :     int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
      84             : {
      85      695780 :     const int nBandDataSize =
      86             :         bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
      87      695780 :     int nOuterLoopIters = nBufXSize - 1;
      88      695780 :     const int nIncSrcOffset = nSrcXInc * nBandDataSize;
      89             :     const GByte *CPL_RESTRICT pabySrcData;
      90      695780 :     int nEndBlockX = nBlockXSize + nStartBlockX;
      91             : 
      92      695780 :     if (iSrcX < nEndBlockX)
      93             :     {
      94      294999 :         CPLAssert(poBlock);
      95      294999 :         goto no_reload_block;
      96             :     }
      97      400781 :     goto reload_block;
      98             : 
      99             :     // Don't do the last iteration in the loop, as iSrcX might go beyond
     100             :     // nRasterXSize - 1
     101     1264973 :     while (--nOuterLoopIters >= 1)
     102             :     {
     103      201834 :         iSrcX += nSrcXInc;
     104      201834 :         pabySrcData += nIncSrcOffset;
     105      201834 :         pabyDstData += nPixelSpace;
     106             : 
     107             :         /* --------------------------------------------------------------------
     108             :          */
     109             :         /*      Ensure we have the appropriate block loaded. */
     110             :         /* --------------------------------------------------------------------
     111             :          */
     112      201834 :         if (iSrcX >= nEndBlockX)
     113             :         {
     114      201834 :         reload_block:
     115             :         {
     116      615205 :             const int nLBlockX = iSrcX / nBlockXSize;
     117      615205 :             nStartBlockX = nLBlockX * nBlockXSize;
     118      615205 :             nEndBlockX = nStartBlockX + nBlockXSize;
     119             : 
     120      615205 :             if (poBlock != nullptr)
     121      341376 :                 poBlock->DropLock();
     122             : 
     123      615205 :             poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     124      615205 :             if (poBlock == nullptr)
     125             :             {
     126           1 :                 return false;
     127             :             }
     128             :         }
     129             : 
     130      615204 :         no_reload_block:
     131             :             const GByte *pabySrcBlock =
     132     1264973 :                 static_cast<const GByte *>(poBlock->GetDataRef());
     133     1264973 :             GPtrDiff_t iSrcOffset =
     134     1264973 :                 (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
     135     1264973 :             pabySrcData = pabySrcBlock + iSrcOffset;
     136             :         }
     137             : 
     138             :         /* --------------------------------------------------------------------
     139             :          */
     140             :         /*      Copy the maximum run of pixels. */
     141             :         /* --------------------------------------------------------------------
     142             :          */
     143             : 
     144     1264973 :         const int nIters = std::min(
     145     1264973 :             (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
     146             :         if (bSameDataType)
     147             :         {
     148     1264530 :             memcpy(pabyDstData, pabySrcData, nBandDataSize);
     149     1264530 :             if (nIters > 1)
     150             :             {
     151             :                 if (DATA_TYPE_SIZE == 1)
     152             :                 {
     153      326250 :                     pabySrcData += nIncSrcOffset;
     154      326250 :                     pabyDstData += nPixelSpace;
     155      326250 :                     GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
     156      326250 :                                      nPixelSpace, nIters - 1);
     157      326250 :                     pabySrcData +=
     158      326250 :                         static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
     159      326250 :                     pabyDstData +=
     160      326250 :                         static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
     161             :                 }
     162             :                 else
     163             :                 {
     164     4395716 :                     for (int i = 0; i < nIters - 1; i++)
     165             :                     {
     166     4197550 :                         pabySrcData += nIncSrcOffset;
     167     4197550 :                         pabyDstData += nPixelSpace;
     168     4197550 :                         memcpy(pabyDstData, pabySrcData, nBandDataSize);
     169             :                     }
     170             :                 }
     171      524420 :                 iSrcX += nSrcXInc * (nIters - 1);
     172      524420 :                 nOuterLoopIters -= nIters - 1;
     173             :             }
     174             :         }
     175             :         else
     176             :         {
     177             :             // Type to type conversion ...
     178         443 :             GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
     179         443 :                             eBufType, nPixelSpace, std::max(1, nIters));
     180         443 :             if (nIters > 1)
     181             :             {
     182         216 :                 pabySrcData +=
     183         216 :                     static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
     184         216 :                 pabyDstData +=
     185         216 :                     static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
     186         216 :                 iSrcX += nSrcXInc * (nIters - 1);
     187         216 :                 nOuterLoopIters -= nIters - 1;
     188             :             }
     189             :         }
     190             :     }
     191             : 
     192             :     // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
     193     1063139 :     if (nOuterLoopIters == 0)
     194             :     {
     195      367360 :         const int nRasterXSize = poBand->GetXSize();
     196      367360 :         iSrcX =
     197      734720 :             static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
     198      367360 :                                       static_cast<GInt64>(nRasterXSize - 1)));
     199      367360 :         pabyDstData += nPixelSpace;
     200      367360 :         if (iSrcX < nEndBlockX)
     201             :         {
     202      354770 :             goto no_reload_block;
     203             :         }
     204       12590 :         goto reload_block;
     205             :     }
     206      695779 :     return true;
     207             : }
     208             : 
     209             : template <class A, class B>
     210     2729820 : CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(A a, B b)
     211             : {
     212     2729820 :     return a * b;
     213             : }
     214             : 
     215             : /************************************************************************/
     216             : /*                             IRasterIO()                              */
     217             : /*                                                                      */
     218             : /*      Default internal implementation of RasterIO() ... utilizes      */
     219             : /*      the Block access methods to satisfy the request.  This would    */
     220             : /*      normally only be overridden by formats with overviews.          */
     221             : /************************************************************************/
     222             : 
     223     6117120 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
     224             :                                  int nXSize, int nYSize, void *pData,
     225             :                                  int nBufXSize, int nBufYSize,
     226             :                                  GDALDataType eBufType, GSpacing nPixelSpace,
     227             :                                  GSpacing nLineSpace,
     228             :                                  GDALRasterIOExtraArg *psExtraArg)
     229             : 
     230             : {
     231     6117120 :     if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
     232             :     {
     233           0 :         CPLError(eFlushBlockErr, CPLE_AppDefined,
     234             :                  "An error occurred while writing a dirty block "
     235             :                  "from GDALRasterBand::IRasterIO");
     236           0 :         CPLErr eErr = eFlushBlockErr;
     237           0 :         eFlushBlockErr = CE_None;
     238           0 :         return eErr;
     239             :     }
     240     6117120 :     if (nBlockXSize <= 0 || nBlockYSize <= 0)
     241             :     {
     242          25 :         CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
     243           0 :         return CE_Failure;
     244             :     }
     245             : 
     246     6117100 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
     247     6117090 :     const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
     248     6117080 :     GByte dummyBlock[2] = {0, 0};
     249     6117080 :     GByte *pabySrcBlock =
     250             :         dummyBlock; /* to avoid Coverity warning about nullptr dereference */
     251     6117080 :     GDALRasterBlock *poBlock = nullptr;
     252     6117080 :     const bool bUseIntegerRequestCoords =
     253     6464140 :         (!psExtraArg->bFloatingPointWindowValidity ||
     254      347059 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
     255      323678 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
     256             : 
     257             :     /* ==================================================================== */
     258             :     /*      A common case is the data requested with the destination        */
     259             :     /*      is packed, and the block width is the raster width.             */
     260             :     /* ==================================================================== */
     261     6039650 :     if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
     262     3190780 :         nBlockXSize == GetXSize() && nBufXSize == nXSize &&
     263    12156800 :         nBufYSize == nYSize && bUseIntegerRequestCoords)
     264             :     {
     265     3078310 :         CPLErr eErr = CE_None;
     266     3078310 :         int nLBlockY = -1;
     267             : 
     268     9522570 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     269             :         {
     270     6445330 :             const int iSrcY = iBufYOff + nYOff;
     271             : 
     272     6445330 :             if (iSrcY < nLBlockY * nBlockYSize ||
     273     6445340 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
     274             :             {
     275     3335260 :                 nLBlockY = iSrcY / nBlockYSize;
     276     3335260 :                 bool bJustInitialize =
     277      295418 :                     eRWFlag == GF_Write && nXOff == 0 &&
     278     3687760 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     279       57080 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
     280             : 
     281             :                 // Is this a partial tile at right and/or bottom edges of
     282             :                 // the raster, and that is going to be completely written?
     283             :                 // If so, do not load it from storage, but zero it so that
     284             :                 // the content outsize of the validity area is initialized.
     285     3335260 :                 bool bMemZeroBuffer = false;
     286      295418 :                 if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
     287       23861 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     288     3630770 :                     nYOff + nYSize == GetYSize() &&
     289          89 :                     nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
     290             :                 {
     291          89 :                     bJustInitialize = true;
     292          89 :                     bMemZeroBuffer = true;
     293             :                 }
     294             : 
     295     3335260 :                 if (poBlock)
     296      256904 :                     poBlock->DropLock();
     297             : 
     298     3335260 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     299     3335170 :                 poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
     300     3335360 :                 if (poBlock == nullptr)
     301             :                 {
     302        1078 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     303             :                         nullptr)
     304             :                     {
     305           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     306             :                                  "GetBlockRef failed at X block offset %d, "
     307             :                                  "Y block offset %d%s",
     308             :                                  0, nLBlockY,
     309           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     310           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     311             :                                      : "");
     312             :                     }
     313        1078 :                     eErr = CE_Failure;
     314        1078 :                     break;
     315             :                 }
     316             : 
     317     3334280 :                 if (eRWFlag == GF_Write)
     318      295418 :                     poBlock->MarkDirty();
     319             : 
     320     3334280 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     321     3334260 :                 if (bMemZeroBuffer)
     322             :                 {
     323          89 :                     memset(pabySrcBlock, 0,
     324          89 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     325          89 :                                nBlockXSize * nBlockYSize);
     326             :                 }
     327             :             }
     328             : 
     329     6444330 :             const auto nSrcByteOffset =
     330     6444330 :                 (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
     331     6444330 :                      nBlockXSize +
     332     6444330 :                  nXOff) *
     333     6444330 :                 nBandDataSize;
     334             : 
     335     6444330 :             if (eDataType == eBufType)
     336             :             {
     337     2810400 :                 if (eRWFlag == GF_Read)
     338     2340510 :                     memcpy(static_cast<GByte *>(pData) +
     339     2340510 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     340     2340510 :                            pabySrcBlock + nSrcByteOffset,
     341             :                            static_cast<size_t>(nLineSpace));
     342             :                 else
     343      469892 :                     memcpy(pabySrcBlock + nSrcByteOffset,
     344      469892 :                            static_cast<GByte *>(pData) +
     345      469892 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     346             :                            static_cast<size_t>(nLineSpace));
     347             :             }
     348             :             else
     349             :             {
     350             :                 // Type to type conversion.
     351     3633920 :                 if (eRWFlag == GF_Read)
     352     3611450 :                     GDALCopyWords64(
     353     3611450 :                         pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
     354             :                         static_cast<GByte *>(pData) +
     355     3611450 :                             static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     356             :                         eBufType, static_cast<int>(nPixelSpace), nBufXSize);
     357             :                 else
     358       22474 :                     GDALCopyWords64(static_cast<GByte *>(pData) +
     359       22474 :                                         static_cast<GPtrDiff_t>(iBufYOff) *
     360             :                                             nLineSpace,
     361             :                                     eBufType, static_cast<int>(nPixelSpace),
     362       22474 :                                     pabySrcBlock + nSrcByteOffset, eDataType,
     363             :                                     nBandDataSize, nBufXSize);
     364             :             }
     365             : 
     366     6527580 :             if (psExtraArg->pfnProgress != nullptr &&
     367       83314 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     368             :                                          psExtraArg->pProgressData))
     369             :             {
     370           5 :                 eErr = CE_Failure;
     371           5 :                 break;
     372             :             }
     373             :         }
     374             : 
     375     3078320 :         if (poBlock)
     376     3077320 :             poBlock->DropLock();
     377             : 
     378     3078400 :         return eErr;
     379             :     }
     380             : 
     381             :     /* ==================================================================== */
     382             :     /*      Do we have overviews that would be appropriate to satisfy       */
     383             :     /*      this request?                                                   */
     384             :     /* ==================================================================== */
     385     3038790 :     if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
     386             :         eRWFlag == GF_Read)
     387             :     {
     388             :         GDALRasterIOExtraArg sExtraArg;
     389        2967 :         GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
     390             : 
     391             :         const int nOverview =
     392        2967 :             GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
     393             :                                           nBufXSize, nBufYSize, &sExtraArg);
     394        2967 :         if (nOverview >= 0)
     395             :         {
     396        2892 :             GDALRasterBand *poOverviewBand = GetOverview(nOverview);
     397        2892 :             if (poOverviewBand == nullptr)
     398        2892 :                 return CE_Failure;
     399             : 
     400        2892 :             return poOverviewBand->RasterIO(
     401             :                 eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
     402        2892 :                 nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
     403             :         }
     404             :     }
     405             : 
     406      847070 :     if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
     407           6 :         nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
     408     3882910 :         nLineSpace == nPixelSpace * nBufXSize &&
     409           6 :         CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
     410             :     {
     411           0 :         memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
     412           0 :         return CE_None;
     413             :     }
     414             : 
     415             :     /* ==================================================================== */
     416             :     /*      The second case when we don't need subsample data but likely    */
     417             :     /*      need data type conversion.                                      */
     418             :     /* ==================================================================== */
     419     3035840 :     if (  // nPixelSpace == nBufDataSize &&
     420     3035840 :         nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
     421             :     {
     422             : #if DEBUG_VERBOSE
     423             :         printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
     424             :                nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
     425             : #endif
     426             : 
     427             :         /* --------------------------------------------------------------------
     428             :          */
     429             :         /*      Loop over buffer computing source locations. */
     430             :         /* --------------------------------------------------------------------
     431             :          */
     432             :         // Calculate starting values out of loop
     433     2470240 :         const int nLBlockXStart = nXOff / nBlockXSize;
     434     2470240 :         const int nXSpanEnd = nBufXSize + nXOff;
     435             : 
     436     2470240 :         int nYInc = 0;
     437     4979680 :         for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
     438     2509440 :              iBufYOff += nYInc, iSrcY += nYInc)
     439             :         {
     440     2509510 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     441             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     442     2509510 :             int nLBlockY = iSrcY / nBlockYSize;
     443     2509510 :             int nLBlockX = nLBlockXStart;
     444     2509510 :             int iSrcX = nXOff;
     445     5239270 :             while (iSrcX < nXSpanEnd)
     446             :             {
     447     2729820 :                 int nXSpan = nLBlockX * nBlockXSize;
     448     2729820 :                 if (nXSpan < INT_MAX - nBlockXSize)
     449     2729810 :                     nXSpan += nBlockXSize;
     450             :                 else
     451          13 :                     nXSpan = INT_MAX;
     452     2729820 :                 const int nXRight = nXSpan;
     453     2729820 :                 nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
     454             : 
     455             :                 const size_t nXSpanSize =
     456     2729820 :                     CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
     457             : 
     458     2729810 :                 bool bJustInitialize =
     459     2042250 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
     460       37308 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
     461     4797690 :                     nXOff <= nLBlockX * nBlockXSize &&
     462       25632 :                     nXOff + nXSize >= nXRight;
     463             : 
     464             :                 // Is this a partial tile at right and/or bottom edges of
     465             :                 // the raster, and that is going to be completely written?
     466             :                 // If so, do not load it from storage, but zero it so that
     467             :                 // the content outsize of the validity area is initialized.
     468     2729810 :                 bool bMemZeroBuffer = false;
     469     2042250 :                 if (eRWFlag == GF_Write && !bJustInitialize &&
     470     2017850 :                     nXOff <= nLBlockX * nBlockXSize &&
     471     2016200 :                     nYOff <= nLBlockY * nBlockYSize &&
     472       12152 :                     (nXOff + nXSize >= nXRight ||
     473             :                      // cppcheck-suppress knownConditionTrueFalse
     474     4774770 :                      (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
     475       11972 :                     (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
     476       10750 :                      (nYOff + nYSize == GetYSize() &&
     477        1958 :                       nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
     478             :                 {
     479        3180 :                     bJustInitialize = true;
     480        3180 :                     bMemZeroBuffer = true;
     481             :                 }
     482             : 
     483             :                 /* --------------------------------------------------------------------
     484             :                  */
     485             :                 /*      Ensure we have the appropriate block loaded. */
     486             :                 /* --------------------------------------------------------------------
     487             :                  */
     488     2729810 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     489     2729840 :                 poBlock =
     490     2729800 :                     GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     491     2729840 :                 if (!poBlock)
     492             :                 {
     493          71 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     494             :                         nullptr)
     495             :                     {
     496           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     497             :                                  "GetBlockRef failed at X block offset %d, "
     498             :                                  "Y block offset %d%s",
     499             :                                  nLBlockX, nLBlockY,
     500           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     501           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     502             :                                      : "");
     503             :                     }
     504          71 :                     return (CE_Failure);
     505             :                 }
     506             : 
     507     2729770 :                 if (eRWFlag == GF_Write)
     508     2042250 :                     poBlock->MarkDirty();
     509             : 
     510     2729760 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     511     2729760 :                 if (bMemZeroBuffer)
     512             :                 {
     513        3180 :                     memset(pabySrcBlock, 0,
     514        3180 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     515        3180 :                                nBlockXSize * nBlockYSize);
     516             :                 }
     517             :                 /* --------------------------------------------------------------------
     518             :                  */
     519             :                 /*      Copy over this chunk of data. */
     520             :                 /* --------------------------------------------------------------------
     521             :                  */
     522     2729760 :                 GPtrDiff_t iSrcOffset =
     523     2729760 :                     (static_cast<GPtrDiff_t>(iSrcX) -
     524     2729760 :                      static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
     525     2729760 :                      (static_cast<GPtrDiff_t>(iSrcY) -
     526     2729760 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     527     2729760 :                          nBlockXSize) *
     528     2729760 :                     nBandDataSize;
     529             :                 // Fill up as many rows as possible for the loaded block.
     530     5459530 :                 const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
     531     2729760 :                                           nBufYSize - iBufYOff);
     532    59506200 :                 for (int k = 0; k < kmax; k++)
     533             :                 {
     534    56776400 :                     if (eDataType == eBufType && nPixelSpace == nBufDataSize)
     535             :                     {
     536    52841600 :                         if (eRWFlag == GF_Read)
     537    48408300 :                             memcpy(static_cast<GByte *>(pData) + iBufOffset +
     538    48408300 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     539    48408300 :                                    pabySrcBlock + iSrcOffset, nXSpanSize);
     540             :                         else
     541     4433360 :                             memcpy(pabySrcBlock + iSrcOffset,
     542     4433360 :                                    static_cast<GByte *>(pData) + iBufOffset +
     543     4433360 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     544             :                                    nXSpanSize);
     545             :                     }
     546             :                     else
     547             :                     {
     548             :                         /* type to type conversion */
     549     3934780 :                         if (eRWFlag == GF_Read)
     550     3884920 :                             GDALCopyWords64(
     551     3884920 :                                 pabySrcBlock + iSrcOffset, eDataType,
     552             :                                 nBandDataSize,
     553     3884920 :                                 static_cast<GByte *>(pData) + iBufOffset +
     554     3884920 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     555             :                                 eBufType, static_cast<int>(nPixelSpace),
     556             :                                 nXSpan);
     557             :                         else
     558       49861 :                             GDALCopyWords64(
     559       49861 :                                 static_cast<GByte *>(pData) + iBufOffset +
     560       49861 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     561             :                                 eBufType, static_cast<int>(nPixelSpace),
     562       49861 :                                 pabySrcBlock + iSrcOffset, eDataType,
     563             :                                 nBandDataSize, nXSpan);
     564             :                     }
     565             : 
     566    56776400 :                     iSrcOffset +=
     567    56776400 :                         static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
     568             :                 }
     569             : 
     570             :                 iBufOffset =
     571     2729760 :                     CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
     572     2729760 :                 nLBlockX++;
     573     2729760 :                 iSrcX += nXSpan;
     574             : 
     575     2729760 :                 poBlock->DropLock();
     576     2729760 :                 poBlock = nullptr;
     577             :             }
     578             : 
     579             :             /* Compute the increment to go on a block boundary */
     580     2509450 :             nYInc = nBlockYSize - (iSrcY % nBlockYSize);
     581             : 
     582     2511300 :             if (psExtraArg->pfnProgress != nullptr &&
     583        1855 :                 !psExtraArg->pfnProgress(
     584     2511300 :                     1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
     585             :                     psExtraArg->pProgressData))
     586             :             {
     587           5 :                 return CE_Failure;
     588             :             }
     589             :         }
     590             : 
     591     2470170 :         return CE_None;
     592             :     }
     593             : 
     594             :     /* ==================================================================== */
     595             :     /*      Loop reading required source blocks to satisfy output           */
     596             :     /*      request.  This is the most general implementation.              */
     597             :     /* ==================================================================== */
     598             : 
     599      565595 :     double dfXOff = nXOff;
     600      565595 :     double dfYOff = nYOff;
     601      565595 :     double dfXSize = nXSize;
     602      565595 :     double dfYSize = nYSize;
     603      565595 :     if (psExtraArg->bFloatingPointWindowValidity)
     604             :     {
     605      230598 :         dfXOff = psExtraArg->dfXOff;
     606      230598 :         dfYOff = psExtraArg->dfYOff;
     607      230598 :         dfXSize = psExtraArg->dfXSize;
     608      230598 :         dfYSize = psExtraArg->dfYSize;
     609             :     }
     610             : 
     611             :     /* -------------------------------------------------------------------- */
     612             :     /*      Compute stepping increment.                                     */
     613             :     /* -------------------------------------------------------------------- */
     614      565595 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
     615      565595 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
     616      565595 :     CPLErr eErr = CE_None;
     617             : 
     618      565595 :     if (eRWFlag == GF_Write)
     619             :     {
     620             :         /* --------------------------------------------------------------------
     621             :          */
     622             :         /*    Write case */
     623             :         /*    Loop over raster window computing source locations in the buffer.
     624             :          */
     625             :         /* --------------------------------------------------------------------
     626             :          */
     627      166655 :         GByte *pabyDstBlock = nullptr;
     628      166655 :         int nLBlockX = -1;
     629      166655 :         int nLBlockY = -1;
     630             : 
     631     1260010 :         for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
     632             :         {
     633     1093360 :             const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
     634             : 
     635    12384200 :             for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
     636             :             {
     637    11290800 :                 const int iBufXOff =
     638    11290800 :                     static_cast<int>((iDstX - nXOff) / dfSrcXInc);
     639    11290800 :                 GPtrDiff_t iBufOffset =
     640    11290800 :                     static_cast<GPtrDiff_t>(iBufYOff) *
     641             :                         static_cast<GPtrDiff_t>(nLineSpace) +
     642    11290800 :                     iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
     643             : 
     644             :                 // FIXME: this code likely doesn't work if the dirty block gets
     645             :                 // flushed to disk before being completely written.
     646             :                 // In the meantime, bJustInitialize should probably be set to
     647             :                 // FALSE even if it is not ideal performance wise, and for
     648             :                 // lossy compression.
     649             : 
     650             :                 /* --------------------------------------------------------------------
     651             :                  */
     652             :                 /*      Ensure we have the appropriate block loaded. */
     653             :                 /* --------------------------------------------------------------------
     654             :                  */
     655    11290800 :                 if (iDstX < nLBlockX * nBlockXSize ||
     656    11041500 :                     iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
     657    10584800 :                     iDstY < nLBlockY * nBlockYSize ||
     658    10584800 :                     iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
     659             :                 {
     660      738702 :                     nLBlockX = iDstX / nBlockXSize;
     661      738702 :                     nLBlockY = iDstY / nBlockYSize;
     662             : 
     663      738702 :                     const bool bJustInitialize =
     664     1065990 :                         nYOff <= nLBlockY * nBlockYSize &&
     665      327291 :                         nYOff + nYSize - nBlockYSize >=
     666      327291 :                             nLBlockY * nBlockYSize &&
     667     1116320 :                         nXOff <= nLBlockX * nBlockXSize &&
     668       50325 :                         nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
     669             :                     /*bool bMemZeroBuffer = FALSE;
     670             :                     if( !bJustInitialize &&
     671             :                         nXOff <= nLBlockX * nBlockXSize &&
     672             :                         nYOff <= nLBlockY * nBlockYSize &&
     673             :                         (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
     674             :                          (nXOff + nXSize == GetXSize() &&
     675             :                          (nLBlockX+1) * nBlockXSize > GetXSize())) &&
     676             :                         (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
     677             :                          (nYOff + nYSize == GetYSize() &&
     678             :                          (nLBlockY+1) * nBlockYSize > GetYSize())) )
     679             :                     {
     680             :                         bJustInitialize = TRUE;
     681             :                         bMemZeroBuffer = TRUE;
     682             :                     }*/
     683      738702 :                     if (poBlock != nullptr)
     684      572047 :                         poBlock->DropLock();
     685             : 
     686      738702 :                     poBlock =
     687      738702 :                         GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     688      738702 :                     if (poBlock == nullptr)
     689             :                     {
     690           0 :                         return (CE_Failure);
     691             :                     }
     692             : 
     693      738702 :                     poBlock->MarkDirty();
     694             : 
     695      738702 :                     pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
     696             :                     /*if( bMemZeroBuffer )
     697             :                     {
     698             :                         memset(pabyDstBlock, 0,
     699             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
     700             :                     * nBlockYSize);
     701             :                     }*/
     702             :                 }
     703             : 
     704             :                 // To make Coverity happy. Should not happen by design.
     705    11290800 :                 if (pabyDstBlock == nullptr)
     706             :                 {
     707           0 :                     CPLAssert(false);
     708             :                     eErr = CE_Failure;
     709             :                     break;
     710             :                 }
     711             : 
     712             :                 /* --------------------------------------------------------------------
     713             :                  */
     714             :                 /*      Copy over this pixel of data. */
     715             :                 /* --------------------------------------------------------------------
     716             :                  */
     717    11290800 :                 GPtrDiff_t iDstOffset =
     718    11290800 :                     (static_cast<GPtrDiff_t>(iDstX) -
     719    11290800 :                      static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
     720    11290800 :                      (static_cast<GPtrDiff_t>(iDstY) -
     721    11290800 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     722    11290800 :                          nBlockXSize) *
     723    11290800 :                     nBandDataSize;
     724             : 
     725    11290800 :                 if (eDataType == eBufType)
     726             :                 {
     727    11287700 :                     memcpy(pabyDstBlock + iDstOffset,
     728    11287700 :                            static_cast<GByte *>(pData) + iBufOffset,
     729             :                            nBandDataSize);
     730             :                 }
     731             :                 else
     732             :                 {
     733             :                     /* type to type conversion ... ouch, this is expensive way
     734             :                     of handling single words */
     735        3096 :                     GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
     736        3096 :                                     eBufType, 0, pabyDstBlock + iDstOffset,
     737             :                                     eDataType, 0, 1);
     738             :                 }
     739             :             }
     740             : 
     741     1093360 :             if (psExtraArg->pfnProgress != nullptr &&
     742           0 :                 !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
     743             :                                          psExtraArg->pProgressData))
     744             :             {
     745           0 :                 eErr = CE_Failure;
     746           0 :                 break;
     747             :             }
     748             :         }
     749             :     }
     750             :     else
     751             :     {
     752      398940 :         if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
     753             :         {
     754        9499 :             if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
     755        2719 :                  psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
     756        2681 :                  psExtraArg->eResampleAlg == GRIORA_Bilinear ||
     757        6821 :                  psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
     758        3169 :                 GetColorTable() != nullptr)
     759             :             {
     760           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     761             :                          "Resampling method not supported on paletted band. "
     762             :                          "Falling back to nearest neighbour");
     763             :             }
     764        3393 :             else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
     765           3 :                      GDALDataTypeIsComplex(eDataType))
     766             :             {
     767           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     768             :                          "Resampling method not supported on complex data type "
     769             :                          "band. Falling back to nearest neighbour");
     770             :             }
     771             :             else
     772             :             {
     773        3390 :                 return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
     774             :                                          pData, nBufXSize, nBufYSize, eBufType,
     775        3390 :                                          nPixelSpace, nLineSpace, psExtraArg);
     776             :             }
     777             :         }
     778             : 
     779      395548 :         int nLimitBlockY = 0;
     780      395548 :         const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
     781      395548 :         int nStartBlockX = -nBlockXSize;
     782      395548 :         const double EPS = 1e-10;
     783      395548 :         int nLBlockY = -1;
     784      395548 :         const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
     785      395548 :         const bool bIntegerXFactor =
     786      372870 :             bUseIntegerRequestCoords &&
     787      669377 :             static_cast<int>(dfSrcXInc) == dfSrcXInc &&
     788      273829 :             static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
     789             : 
     790             :         /* --------------------------------------------------------------------
     791             :          */
     792             :         /*      Read case */
     793             :         /*      Loop over buffer computing source locations. */
     794             :         /* --------------------------------------------------------------------
     795             :          */
     796     2457760 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     797             :         {
     798             :             // Add small epsilon to avoid some numeric precision issues.
     799     2062220 :             const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
     800     2062220 :             const int iSrcY = static_cast<int>(std::min(
     801     2062220 :                 std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
     802             : 
     803     2062220 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     804             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     805             : 
     806     2062220 :             if (iSrcY >= nLimitBlockY)
     807             :             {
     808      433795 :                 nLBlockY = iSrcY / nBlockYSize;
     809      433795 :                 nLimitBlockY = nLBlockY * nBlockYSize;
     810      433795 :                 if (nLimitBlockY < INT_MAX - nBlockYSize)
     811      433795 :                     nLimitBlockY += nBlockYSize;
     812             :                 else
     813           0 :                     nLimitBlockY = INT_MAX;
     814             :                 // Make sure a new block is loaded.
     815      433795 :                 nStartBlockX = -nBlockXSize;
     816             :             }
     817     1628430 :             else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
     818             :             {
     819             :                 // Make sure a new block is loaded.
     820      441987 :                 nStartBlockX = -nBlockXSize;
     821             :             }
     822             : 
     823     2062220 :             GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
     824     2062220 :                                        static_cast<GPtrDiff_t>(nBlockXSize);
     825             : 
     826     2062220 :             if (bIntegerXFactor)
     827             :             {
     828      695780 :                 int iSrcX = static_cast<int>(dfSrcXStart);
     829      695780 :                 const int nSrcXInc = static_cast<int>(dfSrcXInc);
     830      695780 :                 GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
     831      695780 :                 bool bRet = false;
     832      695780 :                 if (bByteCopy)
     833             :                 {
     834      585773 :                     bRet = DownsamplingIntegerXFactor<true, 1>(
     835             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     836             :                         static_cast<int>(nPixelSpace), nBufXSize, GDT_Byte,
     837             :                         GDT_Byte, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     838             :                 }
     839      110007 :                 else if (eDataType == eBufType)
     840             :                 {
     841      109782 :                     switch (nBandDataSize)
     842             :                     {
     843      109630 :                         case 2:
     844      109630 :                             bRet = DownsamplingIntegerXFactor<true, 2>(
     845             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     846             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     847             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     848             :                                 nBlockXSize, poBlock, nLBlockY);
     849      109630 :                             break;
     850          54 :                         case 4:
     851          54 :                             bRet = DownsamplingIntegerXFactor<true, 4>(
     852             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     853             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     854             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     855             :                                 nBlockXSize, poBlock, nLBlockY);
     856          54 :                             break;
     857          96 :                         case 8:
     858          96 :                             bRet = DownsamplingIntegerXFactor<true, 8>(
     859             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     860             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     861             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     862             :                                 nBlockXSize, poBlock, nLBlockY);
     863          96 :                             break;
     864           2 :                         case 16:
     865           2 :                             bRet = DownsamplingIntegerXFactor<true, 16>(
     866             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     867             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     868             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     869             :                                 nBlockXSize, poBlock, nLBlockY);
     870           2 :                             break;
     871           0 :                         default:
     872           0 :                             CPLAssert(false);
     873             :                             break;
     874             :                     }
     875             :                 }
     876             :                 else
     877             :                 {
     878         225 :                     bRet = DownsamplingIntegerXFactor<false, 0>(
     879             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     880             :                         static_cast<int>(nPixelSpace), nBufXSize, eDataType,
     881             :                         eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     882             :                 }
     883      695780 :                 if (!bRet)
     884           1 :                     eErr = CE_Failure;
     885             :             }
     886             :             else
     887             :             {
     888     1366440 :                 double dfSrcX = dfSrcXStart;
     889   590688000 :                 for (int iBufXOff = 0; iBufXOff < nBufXSize;
     890   589322000 :                      iBufXOff++, dfSrcX += dfSrcXInc)
     891             :                 {
     892             :                     // TODO?: try to avoid the clamping for most iterations
     893             :                     const int iSrcX = static_cast<int>(
     894  1178640000 :                         std::min(std::max(0.0, dfSrcX),
     895   589322000 :                                  static_cast<double>(nRasterXSize - 1)));
     896             : 
     897             :                     /* --------------------------------------------------------------------
     898             :                      */
     899             :                     /*      Ensure we have the appropriate block loaded. */
     900             :                     /* --------------------------------------------------------------------
     901             :                      */
     902   589322000 :                     if (iSrcX >= nBlockXSize + nStartBlockX)
     903             :                     {
     904     1702870 :                         const int nLBlockX = iSrcX / nBlockXSize;
     905     1702870 :                         nStartBlockX = nLBlockX * nBlockXSize;
     906             : 
     907     1702870 :                         if (poBlock != nullptr)
     908     1581150 :                             poBlock->DropLock();
     909             : 
     910     1702870 :                         poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     911     1702870 :                         if (poBlock == nullptr)
     912             :                         {
     913           9 :                             eErr = CE_Failure;
     914           9 :                             break;
     915             :                         }
     916             : 
     917             :                         pabySrcBlock =
     918     1702860 :                             static_cast<GByte *>(poBlock->GetDataRef());
     919             :                     }
     920   589322000 :                     const GPtrDiff_t nDiffX =
     921   589322000 :                         static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
     922             : 
     923             :                     /* --------------------------------------------------------------------
     924             :                      */
     925             :                     /*      Copy over this pixel of data. */
     926             :                     /* --------------------------------------------------------------------
     927             :                      */
     928             : 
     929   589322000 :                     if (bByteCopy)
     930             :                     {
     931   533523000 :                         GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
     932   533523000 :                         static_cast<GByte *>(pData)[iBufOffset] =
     933   533523000 :                             pabySrcBlock[iSrcOffset];
     934             :                     }
     935    55798800 :                     else if (eDataType == eBufType)
     936             :                     {
     937    50322800 :                         GPtrDiff_t iSrcOffset =
     938    50322800 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     939    50322800 :                         memcpy(static_cast<GByte *>(pData) + iBufOffset,
     940    50322800 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
     941             :                     }
     942             :                     else
     943             :                     {
     944             :                         // Type to type conversion ...
     945     5476050 :                         GPtrDiff_t iSrcOffset =
     946     5476050 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     947     5476050 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
     948             :                                         static_cast<GByte *>(pData) +
     949     5476050 :                                             iBufOffset,
     950             :                                         eBufType, 0, 1);
     951             :                     }
     952             : 
     953   589322000 :                     iBufOffset += static_cast<int>(nPixelSpace);
     954             :                 }
     955             :             }
     956     2062220 :             if (eErr == CE_Failure)
     957          11 :                 break;
     958             : 
     959     2296370 :             if (psExtraArg->pfnProgress != nullptr &&
     960      234158 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     961             :                                          psExtraArg->pProgressData))
     962             :             {
     963           1 :                 eErr = CE_Failure;
     964           1 :                 break;
     965             :             }
     966             :         }
     967             :     }
     968             : 
     969      562203 :     if (poBlock != nullptr)
     970      562193 :         poBlock->DropLock();
     971             : 
     972      562203 :     return eErr;
     973             : }
     974             : 
     975             : /************************************************************************/
     976             : /*                         GDALRasterIOTransformer()                    */
     977             : /************************************************************************/
     978             : 
     979             : struct GDALRasterIOTransformerStruct  // Per-axis scale and offset read by GDALRasterIOTransformer() to map point coordinates.
     980             : {
     981             :     double dfXOff;  // Added to x after scaling when bDstToSrc is set; subtracted from x before dividing otherwise.
     982             :     double dfYOff;  // Same role as dfXOff, applied to the y coordinate.
     983             :     double dfXRatioDstToSrc;  // Multiplies x when bDstToSrc is set; used as the divisor for x in the other direction.
     984             :     double dfYRatioDstToSrc;  // Same role as dfXRatioDstToSrc, applied to the y coordinate.
     985             : };
     986             : // Transformer callback (installed as pfnTransformer by RasterIOResampled()): rescales and offsets points in place using a GDALRasterIOTransformerStruct.
     987        6748 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
     988             :                                    int nPointCount, double *x, double *y,
     989             :                                    double * /* z */, int *panSuccess)  // z is unnamed and left untouched
     990             : {
     991        6748 :     GDALRasterIOTransformerStruct *psParams =
     992             :         static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
     993        6748 :     if (bDstToSrc)
     994             :     {  // destination -> source: src = dst * ratio + offset
     995      252996 :         for (int i = 0; i < nPointCount; i++)
     996             :         {
     997      246836 :             x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
     998      246836 :             y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
     999      246836 :             panSuccess[i] = TRUE;
    1000             :         }
    1001             :     }
    1002             :     else
    1003             :     {  // source -> destination: exact inverse, dst = (src - offset) / ratio
    1004        1176 :         for (int i = 0; i < nPointCount; i++)
    1005             :         {
    1006         588 :             x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;
    1007         588 :             y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
    1008         588 :             panSuccess[i] = TRUE;
    1009             :         }
    1010             :     }
    1011        6748 :     return TRUE;  // every point is always flagged as successfully transformed
    1012             : }
    1013             : 
    1014             : /************************************************************************/
    1015             : /*                          RasterIOResampled()                         */
    1016             : /************************************************************************/
    1017             : // Resamples the source window (nXOff, nYOff, nXSize, nYSize) of this band into pData (nBufXSize x nBufYSize), using either the warper or the overview resampling kernels; returns eErr (CE_Failure on allocation failure or progress cancellation in the overview path).
    1018             : //! @cond Doxygen_Suppress
    1019        3390 : CPLErr GDALRasterBand::RasterIOResampled(
    1020             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,  // eRWFlag is unnamed and unused here
    1021             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1022             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    1023             : {
    1024             :     // Use warping resampling (rather than overview resampling) only for complex data types with an algorithm other than nearest neighbour / mode.
    1025             :     const bool bUseWarp =
    1026        3390 :         (GDALDataTypeIsComplex(eDataType) &&
    1027        3549 :          psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    1028         159 :          psExtraArg->eResampleAlg != GRIORA_Mode);
    1029             :     // Source window: the integer window by default, replaced by the floating-point window of psExtraArg when it is flagged valid.
    1030        3390 :     double dfXOff = nXOff;
    1031        3390 :     double dfYOff = nYOff;
    1032        3390 :     double dfXSize = nXSize;
    1033        3390 :     double dfYSize = nYSize;
    1034        3390 :     if (psExtraArg->bFloatingPointWindowValidity)
    1035             :     {
    1036        2695 :         dfXOff = psExtraArg->dfXOff;
    1037        2695 :         dfYOff = psExtraArg->dfYOff;
    1038        2695 :         dfXSize = psExtraArg->dfXSize;
    1039        2695 :         dfYSize = psExtraArg->dfYSize;
    1040             :     }
    1041             :     // Number of source pixels per destination (buffer) pixel along each axis.
    1042        3390 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1043        3390 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1044             : 
    1045             :     // Determine the coordinates in the "virtual" output raster to see
    1046             :     // if they are (nearly) integers, in which case we will use them as a shift
    1047             :     // so that subwindow extracts give the exact same results as entire raster
    1048             :     // scaling.
    1049        3390 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1050        3390 :     bool bHasXOffVirtual = false;
    1051        3390 :     int nDestXOffVirtual = 0;
    1052        3390 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1053             :     {
    1054        3062 :         bHasXOffVirtual = true;
    1055        3062 :         dfXOff = nXOff;  // snap to the integer offset (args.dfSrcXDelta below is then 0)
    1056        3062 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1057             :     }
    1058             :     // Same test along Y.
    1059        3390 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1060        3390 :     bool bHasYOffVirtual = false;
    1061        3390 :     int nDestYOffVirtual = 0;
    1062        3390 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1063             :     {
    1064        3058 :         bHasYOffVirtual = true;
    1065        3058 :         dfYOff = nYOff;  // snap to the integer offset (args.dfSrcYDelta below is then 0)
    1066        3058 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1067             :     }
    1068             : 
    1069             :     // Create a MEM dataset that wraps the output buffer.
    1070             :     GDALDataset *poMEMDS;
    1071        3390 :     void *pTempBuffer = nullptr;
    1072        3390 :     GSpacing nPSMem = nPixelSpace;
    1073        3390 :     GSpacing nLSMem = nLineSpace;
    1074        3390 :     void *pDataMem = pData;
    1075        3390 :     GDALDataType eDTMem = eBufType;
    1076        3390 :     if (eBufType != eDataType)  // buffer type differs from the band type: resample into a packed temporary buffer of the band type first
    1077             :     {
    1078          44 :         nPSMem = GDALGetDataTypeSizeBytes(eDataType);
    1079          44 :         nLSMem = nPSMem * nBufXSize;
    1080             :         pTempBuffer =
    1081          44 :             VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
    1082          44 :         if (pTempBuffer == nullptr)
    1083           0 :             return CE_Failure;
    1084          44 :         pDataMem = pTempBuffer;
    1085          44 :         eDTMem = eDataType;
    1086             :     }
    1087             :     // The MEM dataset is enlarged by the virtual offsets, so the band pointer is moved back such that pixel (nDestXOffVirtual, nDestYOffVirtual) lands on the first byte of pDataMem.
    1088             :     poMEMDS =
    1089        3390 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1090             :                            nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
    1091        3390 :     GByte *pabyData = static_cast<GByte *>(pDataMem) -
    1092        3390 :                       nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
    1093        3390 :     GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
    1094             :         poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
    1095        3390 :     poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));  // NOTE(review): poMEMDS is dereferenced without a null check — confirm MEMDataset::Create() cannot fail here
    1096             :     // Propagate the NBITS item of the IMAGE_STRUCTURE domain, if present, to the MEM band.
    1097        3390 :     const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1098        3390 :     const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    1099        3390 :     if (pszNBITS)
    1100           6 :         GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
    1101           6 :             "NBITS", pszNBITS, "IMAGE_STRUCTURE");
    1102             : 
    1103        3390 :     CPLErr eErr = CE_None;
    1104             : 
    1105             :     // Do the resampling.
    1106        3390 :     if (bUseWarp)
    1107             :     {
    1108         149 :         int bHasNoData = FALSE;
    1109         149 :         double dfNoDataValue = GetNoDataValue(&bHasNoData);
    1110             : 
    1111         149 :         VRTDatasetH hVRTDS = nullptr;
    1112         149 :         GDALRasterBandH hVRTBand = nullptr;
    1113         149 :         if (GetDataset() == nullptr)  // band without an owning dataset: a temporary VRT becomes the warp source
    1114             :         {
    1115             :             /* Create VRT dataset that wraps the whole dataset */
    1116           0 :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1117           0 :             VRTAddBand(hVRTDS, eDataType, nullptr);
    1118           0 :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1119           0 :             VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
    1120             :                                0, 0, nRasterXSize, nRasterYSize, nullptr,
    1121             :                                VRT_NODATA_UNSET);
    1122             : 
    1123             :             /* Add a mask band if needed */
    1124           0 :             if (GetMaskFlags() != GMF_ALL_VALID)
    1125             :             {
    1126           0 :                 GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
    1127             :                 VRTSourcedRasterBand *poVRTMaskBand =
    1128             :                     reinterpret_cast<VRTSourcedRasterBand *>(
    1129             :                         reinterpret_cast<GDALRasterBand *>(hVRTBand)
    1130           0 :                             ->GetMaskBand());
    1131           0 :                 poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
    1132           0 :                                                  nRasterYSize, 0, 0,
    1133           0 :                                                  nRasterXSize, nRasterYSize);
    1134             :             }
    1135             :         }
    1136             :         // Map the RasterIO resampling algorithm onto the matching warper algorithm.
    1137         149 :         GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
    1138         149 :         switch (psExtraArg->eResampleAlg)
    1139             :         {
    1140           0 :             case GRIORA_NearestNeighbour:
    1141           0 :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1142           0 :                 break;
    1143         147 :             case GRIORA_Bilinear:
    1144         147 :                 psWarpOptions->eResampleAlg = GRA_Bilinear;
    1145         147 :                 break;
    1146           0 :             case GRIORA_Cubic:
    1147           0 :                 psWarpOptions->eResampleAlg = GRA_Cubic;
    1148           0 :                 break;
    1149           0 :             case GRIORA_CubicSpline:
    1150           0 :                 psWarpOptions->eResampleAlg = GRA_CubicSpline;
    1151           0 :                 break;
    1152           0 :             case GRIORA_Lanczos:
    1153           0 :                 psWarpOptions->eResampleAlg = GRA_Lanczos;
    1154           0 :                 break;
    1155           0 :             case GRIORA_Average:
    1156           0 :                 psWarpOptions->eResampleAlg = GRA_Average;
    1157           0 :                 break;
    1158           2 :             case GRIORA_RMS:
    1159           2 :                 psWarpOptions->eResampleAlg = GRA_RMS;
    1160           2 :                 break;
    1161           0 :             case GRIORA_Mode:
    1162           0 :                 psWarpOptions->eResampleAlg = GRA_Mode;
    1163           0 :                 break;
    1164           0 :             default:
    1165           0 :                 CPLAssert(false);
    1166             :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1167             :                 break;
    1168             :         }
    1169         149 :         psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
    1170         149 :         psWarpOptions->hDstDS = poMEMDS;
    1171         149 :         psWarpOptions->nBandCount = 1;
    1172         149 :         int nSrcBandNumber = hVRTDS ? 1 : nBand;
    1173         149 :         int nDstBandNumber = 1;
    1174         149 :         psWarpOptions->panSrcBands = &nSrcBandNumber;  // points at a stack local; reset to nullptr before the options are destroyed
    1175         149 :         psWarpOptions->panDstBands = &nDstBandNumber;  // likewise
    1176         298 :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress
    1177         149 :                                          ? psExtraArg->pfnProgress
    1178             :                                          : GDALDummyProgress;
    1179         149 :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1180         149 :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1181         149 :         if (bHasNoData)  // initialise the destination to nodata and use the band nodata value for both source and destination
    1182             :         {
    1183           0 :             psWarpOptions->papszWarpOptions = CSLSetNameValue(
    1184             :                 psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
    1185           0 :             if (psWarpOptions->padfSrcNoDataReal == nullptr)
    1186             :             {
    1187           0 :                 psWarpOptions->padfSrcNoDataReal =
    1188           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1189           0 :                 psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
    1190             :             }
    1191             : 
    1192           0 :             if (psWarpOptions->padfDstNoDataReal == nullptr)
    1193             :             {
    1194           0 :                 psWarpOptions->padfDstNoDataReal =
    1195           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1196           0 :                 psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
    1197             :             }
    1198             :         }
    1199             :         // Transformer parameters: the offset is 0 along an axis where a virtual destination offset is used instead.
    1200             :         GDALRasterIOTransformerStruct sTransformer;
    1201         149 :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1202         149 :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1203         149 :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1204         149 :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1205         149 :         psWarpOptions->pTransformerArg = &sTransformer;
    1206             : 
    1207             :         GDALWarpOperationH hWarpOperation =
    1208         149 :             GDALCreateWarpOperation(psWarpOptions);
    1209         149 :         eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
    1210             :                                      nDestYOffVirtual, nBufXSize, nBufYSize);
    1211         149 :         GDALDestroyWarpOperation(hWarpOperation);
    1212             :         // Detach the stack-allocated band number arrays before destroying the options.
    1213         149 :         psWarpOptions->panSrcBands = nullptr;
    1214         149 :         psWarpOptions->panDstBands = nullptr;
    1215         149 :         GDALDestroyWarpOptions(psWarpOptions);
    1216             : 
    1217         149 :         if (hVRTDS)
    1218           0 :             GDALClose(hVRTDS);
    1219             :     }
    1220             :     else
    1221             :     {
    1222        3241 :         const char *pszResampling =
    1223        4210 :             (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
    1224        1267 :             : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
    1225         558 :             : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
    1226         479 :             : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
    1227         342 :             : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
    1228         199 :             : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
    1229          79 :             : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
    1230           3 :             : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
    1231             :                                                                : "UNKNOWN";
    1232             :         // Look up the overview resampling function, its kernel radius and the working data type for this algorithm.
    1233        3241 :         int nKernelRadius = 0;
    1234             :         GDALResampleFunction pfnResampleFunc =
    1235        3241 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1236        3241 :         CPLAssert(pfnResampleFunc);
    1237             :         GDALDataType eWrkDataType =
    1238        3241 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1239        3241 :         int nHasNoData = 0;
    1240        3241 :         double dfNoDataValue = GetNoDataValue(&nHasNoData);
    1241        3241 :         const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    1242        3241 :         if (!bHasNoData)
    1243        3151 :             dfNoDataValue = 0.0;
    1244             :         // Halve the destination block until the matching full-resolution chunk holds at most 1024 * 1024 pixels (or the block is down to 1x1).
    1245        3241 :         int nDstBlockXSize = nBufXSize;
    1246        3241 :         int nDstBlockYSize = nBufYSize;
    1247        3241 :         int nFullResXChunk = 0;
    1248        3241 :         int nFullResYChunk = 0;
    1249             :         while (true)
    1250             :         {
    1251        3252 :             nFullResXChunk =
    1252        3252 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1253        3252 :             nFullResYChunk =
    1254        3252 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1255        3252 :             if (nFullResXChunk > nRasterXSize)
    1256        2909 :                 nFullResXChunk = nRasterXSize;
    1257        3252 :             if (nFullResYChunk > nRasterYSize)
    1258         510 :                 nFullResYChunk = nRasterYSize;
    1259        3252 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1260        3194 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1261             :                  1024 * 1024))
    1262             :                 break;
    1263             :             // When operating on the full width of a raster whose block width is
    1264             :             // the raster width, prefer doing chunks in height.
    1265          11 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1266             :                 nDstBlockYSize > 1)
    1267           0 :                 nDstBlockYSize /= 2;
    1268             :             /* Otherwise cut the maximal dimension */
    1269          11 :             else if (nDstBlockXSize > 1 &&
    1270           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1271          11 :                 nDstBlockXSize /= 2;
    1272             :             else
    1273           0 :                 nDstBlockYSize /= 2;
    1274             :         }
    1275             :         // Rounded source/destination ratio (at least 1), used to express the kernel radius in source pixels.
    1276        3241 :         int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
    1277        3241 :         int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    1278        3241 :         if (nOvrXFactor == 0)
    1279        2029 :             nOvrXFactor = 1;
    1280        3241 :         if (nOvrYFactor == 0)
    1281        2028 :             nOvrYFactor = 1;
    1282        3241 :         int nFullResXSizeQueried =
    1283        3241 :             nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;  // chunk widened by the kernel radius on both sides
    1284        3241 :         int nFullResYSizeQueried =
    1285        3241 :             nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
    1286             : 
    1287        3241 :         if (nFullResXSizeQueried > nRasterXSize)
    1288        2699 :             nFullResXSizeQueried = nRasterXSize;
    1289        3241 :         if (nFullResYSizeQueried > nRasterYSize)
    1290         297 :             nFullResYSizeQueried = nRasterYSize;
    1291             : 
    1292             :         void *pChunk =
    1293        3241 :             VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    1294             :                                 nFullResXSizeQueried, nFullResYSizeQueried);
    1295        3241 :         GByte *pabyChunkNoDataMask = nullptr;
    1296             :         // A one-byte-per-pixel mask chunk is only allocated and read when GMF_ALL_VALID is not set in the mask flags.
    1297        3241 :         GDALRasterBand *poMaskBand = GetMaskBand();
    1298        3241 :         int l_nMaskFlags = GetMaskFlags();
    1299             : 
    1300        3241 :         bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
    1301        3241 :         if (bUseNoDataMask)
    1302             :         {
    1303         158 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1304             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1305             :         }
    1306        3241 :         if (pChunk == nullptr ||
    1307         158 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1308             :         {  // allocation failure: release everything acquired so far and bail out
    1309           0 :             GDALClose(poMEMDS);
    1310           0 :             CPLFree(pChunk);
    1311           0 :             CPLFree(pabyChunkNoDataMask);
    1312           0 :             VSIFree(pTempBuffer);
    1313           0 :             return CE_Failure;
    1314             :         }
    1315             :         // Progress is reported once per destination block, as nBlocksDone / nTotalBlocks.
    1316        3241 :         const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
    1317        3241 :                                  DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
    1318        3241 :         int nBlocksDone = 0;
    1319             :         // Iterate over destination blocks, one row of blocks at a time; both loops stop at the first error.
    1320             :         int nDstYOff;
    1321        6482 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1322        3241 :              nDstYOff += nDstBlockYSize)
    1323             :         {
    1324             :             int nDstYCount;
    1325        3241 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1326        3241 :                 nDstYCount = nDstBlockYSize;
    1327             :             else
    1328           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1329             :             // Source rows [nChunkYOff, nChunkYOff2) for this block: one row past the rounded-up end, clamped to the raster height.
    1330        3241 :             int nChunkYOff =
    1331        3241 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1332        3241 :             int nChunkYOff2 = nYOff + 1 +
    1333        3241 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1334             :                                                     dfYRatioDstToSrc));
    1335        3241 :             if (nChunkYOff2 > nRasterYSize)
    1336         652 :                 nChunkYOff2 = nRasterYSize;
    1337        3241 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1338        3241 :             CPLAssert(nYCount <= nFullResYChunk);
    1339             :             // Widen the row range by the kernel radius on both sides, then clamp it to the raster extent.
    1340        3241 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
    1341        3241 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
    1342        3241 :             if (nChunkYOffQueried < 0)
    1343             :             {
    1344         450 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1345         450 :                 nChunkYOffQueried = 0;
    1346             :             }
    1347        3241 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1348         553 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1349        3241 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1350             : 
    1351        3241 :             int nDstXOff = 0;
    1352        6482 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1353        3241 :                  nDstXOff += nDstBlockXSize)
    1354             :             {
    1355        3241 :                 int nDstXCount = 0;
    1356        3241 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1357        3241 :                     nDstXCount = nDstBlockXSize;
    1358             :                 else
    1359           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1360             :                 // Source columns [nChunkXOff, nChunkXOff2) for this block, computed like the row range above.
    1361        3241 :                 int nChunkXOff =
    1362        3241 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1363        3241 :                 int nChunkXOff2 =
    1364        3241 :                     nXOff + 1 +
    1365        3241 :                     static_cast<int>(
    1366        3241 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1367        3241 :                 if (nChunkXOff2 > nRasterXSize)
    1368        2952 :                     nChunkXOff2 = nRasterXSize;
    1369        3241 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1370        3241 :                 CPLAssert(nXCount <= nFullResXChunk);
    1371             :                 // Widen the column range by the kernel radius on both sides, then clamp it to the raster extent.
    1372        3241 :                 int nChunkXOffQueried =
    1373        3241 :                     nChunkXOff - nKernelRadius * nOvrXFactor;
    1374        3241 :                 int nChunkXSizeQueried =
    1375        3241 :                     nXCount + 2 * nKernelRadius * nOvrXFactor;
    1376        3241 :                 if (nChunkXOffQueried < 0)
    1377             :                 {
    1378        2754 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1379        2754 :                     nChunkXOffQueried = 0;
    1380             :                 }
    1381        3241 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1382        2740 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1383        3241 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1384             : 
    1385             :                 // Read the source buffers.
    1386        3241 :                 eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1387             :                                 nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1388             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    1389             :                                 eWrkDataType, 0, 0, nullptr);
    1390             : 
    1391        3241 :                 bool bSkipResample = false;
    1392        3241 :                 bool bNoDataMaskFullyOpaque = false;
    1393        3241 :                 if (eErr == CE_None && bUseNoDataMask)
    1394             :                 {
    1395         158 :                     eErr = poMaskBand->RasterIO(
    1396             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1397             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1398             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1399             :                         nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
    1400             :                     // NOTE(review): the mask buffer is inspected below even when the RasterIO() above failed — confirm this is harmless.
    1401             :                     /* Optimizations if mask is fully opaque or transparent */
    1402         158 :                     int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1403         158 :                     GByte bVal = pabyChunkNoDataMask[0];
    1404         158 :                     int i = 1;
    1405     3751460 :                     for (; i < nPixels; i++)
    1406             :                     {
    1407     3751410 :                         if (pabyChunkNoDataMask[i] != bVal)
    1408         104 :                             break;
    1409             :                     }
    1410         158 :                     if (i == nPixels)  // every mask byte has the same value
    1411             :                     {
    1412          54 :                         if (bVal == 0)  // mask is all zero: fill the destination block with the nodata value and skip resampling
    1413             :                         {
    1414         712 :                             for (int j = 0; j < nDstYCount; j++)
    1415             :                             {
    1416         686 :                                 GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
    1417             :                                                 static_cast<GByte *>(pDataMem) +
    1418         686 :                                                     nLSMem * (j + nDstYOff) +
    1419         686 :                                                     nDstXOff * nPSMem,
    1420             :                                                 eDTMem,
    1421             :                                                 static_cast<int>(nPSMem),
    1422             :                                                 nDstXCount);
    1423             :                             }
    1424          26 :                             bSkipResample = true;
    1425             :                         }
    1426             :                         else
    1427             :                         {
    1428          28 :                             bNoDataMaskFullyOpaque = true;  // uniform non-zero mask: resample without passing a mask
    1429             :                         }
    1430             :                     }
    1431             :                 }
    1432             : 
    1433        3241 :                 if (!bSkipResample && eErr == CE_None)
    1434             :                 {
    1435        3212 :                     const bool bPropagateNoData = false;
    1436        3212 :                     void *pDstBuffer = nullptr;
    1437        3212 :                     GDALDataType eDstBufferDataType = GDT_Unknown;
    1438             :                     GDALRasterBand *poMEMBand =
    1439        3212 :                         GDALRasterBand::FromHandle(hMEMBand);
    1440        3212 :                     GDALOverviewResampleArgs args;
    1441        3212 :                     args.eSrcDataType = eDataType;
    1442        3212 :                     args.eOvrDataType = poMEMBand->GetRasterDataType();
    1443        3212 :                     args.nOvrXSize = poMEMBand->GetXSize();
    1444        3212 :                     args.nOvrYSize = poMEMBand->GetYSize();
    1445        3212 :                     args.nOvrNBITS = nNBITS;
    1446        3212 :                     args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1447        3212 :                     args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1448        3212 :                     args.dfSrcXDelta =
    1449        3212 :                         dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1450        3212 :                     args.dfSrcYDelta =
    1451        3212 :                         dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1452        3212 :                     args.eWrkDataType = eWrkDataType;
    1453        3212 :                     args.pabyChunkNodataMask =
    1454        3212 :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
    1455        3212 :                     args.nChunkXOff =
    1456        3212 :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1457        3212 :                     args.nChunkXSize = nChunkXSizeQueried;
    1458        3212 :                     args.nChunkYOff =
    1459        3212 :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1460        3212 :                     args.nChunkYSize = nChunkYSizeQueried;
    1461        3212 :                     args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1462        3212 :                     args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
    1463        3212 :                     args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1464        3212 :                     args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
    1465        3212 :                     args.pszResampling = pszResampling;
    1466        3212 :                     args.bHasNoData = bHasNoData;
    1467        3212 :                     args.dfNoDataValue = dfNoDataValue;
    1468        3212 :                     args.poColorTable = GetColorTable();
    1469        3212 :                     args.bPropagateNoData = bPropagateNoData;
    1470        3212 :                     eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
    1471             :                                            &eDstBufferDataType);
    1472        3212 :                     if (eErr == CE_None)
    1473             :                     {  // write the resampled block into the MEM band, i.e. into pDataMem
    1474        3212 :                         eErr = poMEMBand->RasterIO(
    1475             :                             GF_Write, nDstXOff + nDestXOffVirtual,
    1476             :                             nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
    1477             :                             pDstBuffer, nDstXCount, nDstYCount,
    1478             :                             eDstBufferDataType, 0, 0, nullptr);
    1479             :                     }
    1480        3212 :                     CPLFree(pDstBuffer);
    1481             :                 }
    1482             : 
    1483        3241 :                 nBlocksDone++;
    1484        3666 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1485         425 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    1486             :                                              "", psExtraArg->pProgressData))
    1487             :                 {
    1488           1 :                     eErr = CE_Failure;  // the progress callback returned false: stop with an error
    1489             :                 }
    1490             :             }
    1491             :         }
    1492             : 
    1493        3241 :         CPLFree(pChunk);
    1494        3241 :         CPLFree(pabyChunkNoDataMask);
    1495             :     }
    1496             :     // When a temporary buffer was used, convert it into the caller's buffer type and spacing; this runs even if eErr is not CE_None and its own result is ignored.
    1497        3390 :     if (eBufType != eDataType)
    1498             :     {
    1499          44 :         CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
    1500             :             GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
    1501             :             pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    1502             :             nullptr));
    1503             :     }
    1504        3390 :     GDALClose(poMEMDS);
    1505        3390 :     VSIFree(pTempBuffer);
    1506             : 
    1507        3390 :     return eErr;
    1508             : }
    1509             : 
    1510             : /************************************************************************/
    1511             : /*                          RasterIOResampled()                         */
    1512             : /************************************************************************/
    1513             : 
    1514         865 : CPLErr GDALDataset::RasterIOResampled(
    1515             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1516             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1517             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    1518             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    1519             : 
    1520             : {
    1521             : #if 0
    1522             :     // Determine if we use warping resampling or overview resampling
    1523             :     bool bUseWarp = false;
    1524             :     if( GDALDataTypeIsComplex( eDataType ) )
    1525             :         bUseWarp = true;
    1526             : #endif
    1527             : 
    1528         865 :     double dfXOff = nXOff;
    1529         865 :     double dfYOff = nYOff;
    1530         865 :     double dfXSize = nXSize;
    1531         865 :     double dfYSize = nYSize;
    1532         865 :     if (psExtraArg->bFloatingPointWindowValidity)
    1533             :     {
    1534         744 :         dfXOff = psExtraArg->dfXOff;
    1535         744 :         dfYOff = psExtraArg->dfYOff;
    1536         744 :         dfXSize = psExtraArg->dfXSize;
    1537         744 :         dfYSize = psExtraArg->dfYSize;
    1538             :     }
    1539             : 
    1540         865 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1541         865 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1542             : 
    1543             :     // Determine the coordinates in the "virtual" output raster to see
    1544             :     // if they are integers, in which case we will use them as a shift
    1545             :     // so that subwindow extracts give the exact same results as entire raster
    1546             :     // scaling.
    1547         865 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1548         865 :     bool bHasXOffVirtual = false;
    1549         865 :     int nDestXOffVirtual = 0;
    1550         865 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1551             :     {
    1552         736 :         bHasXOffVirtual = true;
    1553         736 :         dfXOff = nXOff;
    1554         736 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1555             :     }
    1556             : 
    1557         865 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1558         865 :     bool bHasYOffVirtual = false;
    1559         865 :     int nDestYOffVirtual = 0;
    1560         865 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1561             :     {
    1562         697 :         bHasYOffVirtual = true;
    1563         697 :         dfYOff = nYOff;
    1564         697 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1565             :     }
    1566             : 
    1567             :     // Create a MEM dataset that wraps the output buffer.
    1568             :     GDALDataset *poMEMDS =
    1569         865 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1570             :                            nDestYOffVirtual + nBufYSize, 0, eBufType, nullptr);
    1571             :     GDALRasterBand **papoDstBands = static_cast<GDALRasterBand **>(
    1572         862 :         CPLMalloc(nBandCount * sizeof(GDALRasterBand *)));
    1573         860 :     int nNBITS = 0;
    1574        2765 :     for (int i = 0; i < nBandCount; i++)
    1575             :     {
    1576        1909 :         char szBuffer[32] = {'\0'};
    1577        3830 :         int nRet = CPLPrintPointer(
    1578             :             szBuffer,
    1579        1909 :             static_cast<GByte *>(pData) - nPixelSpace * nDestXOffVirtual -
    1580        1909 :                 nLineSpace * nDestYOffVirtual + nBandSpace * i,
    1581             :             sizeof(szBuffer));
    1582        1921 :         szBuffer[nRet] = 0;
    1583             : 
    1584        1921 :         char szBuffer0[64] = {'\0'};
    1585        1921 :         snprintf(szBuffer0, sizeof(szBuffer0), "DATAPOINTER=%s", szBuffer);
    1586             : 
    1587        1921 :         char szBuffer1[64] = {'\0'};
    1588        1921 :         snprintf(szBuffer1, sizeof(szBuffer1), "PIXELOFFSET=" CPL_FRMT_GIB,
    1589             :                  static_cast<GIntBig>(nPixelSpace));
    1590             : 
    1591        1921 :         char szBuffer2[64] = {'\0'};
    1592        1921 :         snprintf(szBuffer2, sizeof(szBuffer2), "LINEOFFSET=" CPL_FRMT_GIB,
    1593             :                  static_cast<GIntBig>(nLineSpace));
    1594             : 
    1595        1921 :         char *apszOptions[4] = {szBuffer0, szBuffer1, szBuffer2, nullptr};
    1596             : 
    1597        1921 :         poMEMDS->AddBand(eBufType, apszOptions);
    1598             : 
    1599        1925 :         GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
    1600        1911 :         papoDstBands[i] = poMEMDS->GetRasterBand(i + 1);
    1601             :         const char *pszNBITS =
    1602        1912 :             poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1603        1903 :         if (pszNBITS)
    1604             :         {
    1605           0 :             nNBITS = atoi(pszNBITS);
    1606           0 :             poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
    1607           0 :                                                            "IMAGE_STRUCTURE");
    1608             :         }
    1609             :     }
    1610             : 
    1611         856 :     CPLErr eErr = CE_None;
    1612             : 
    1613             :     // TODO(schwehr): Why disabled?  Why not just delete?
    1614             :     // Looks like this code was initially added as disabled by copying
    1615             :     // from RasterIO here:
    1616             :     // https://trac.osgeo.org/gdal/changeset/29572
    1617             : #if 0
    1618             :     // Do the resampling.
    1619             :     if( bUseWarp )
    1620             :     {
    1621             :         VRTDatasetH hVRTDS = nullptr;
    1622             :         GDALRasterBandH hVRTBand = nullptr;
    1623             :         if( GetDataset() == nullptr )
    1624             :         {
    1625             :             /* Create VRT dataset that wraps the whole dataset */
    1626             :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1627             :             VRTAddBand( hVRTDS, eDataType, nullptr );
    1628             :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1629             :             VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
    1630             :                                 (GDALRasterBandH)this,
    1631             :                                 0, 0,
    1632             :                                 nRasterXSize, nRasterYSize,
    1633             :                                 0, 0,
    1634             :                                 nRasterXSize, nRasterYSize,
    1635             :                                 nullptr, VRT_NODATA_UNSET );
    1636             : 
    1637             :             /* Add a mask band if needed */
    1638             :             if( GetMaskFlags() != GMF_ALL_VALID )
    1639             :             {
    1640             :                 ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
    1641             :                 VRTSourcedRasterBand* poVRTMaskBand =
    1642             :                     (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
    1643             :                 poVRTMaskBand->
    1644             :                     AddMaskBandSource( this,
    1645             :                                     0, 0,
    1646             :                                     nRasterXSize, nRasterYSize,
    1647             :                                     0, 0,
    1648             :                                     nRasterXSize, nRasterYSize);
    1649             :             }
    1650             :         }
    1651             : 
    1652             :         GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
    1653             :         psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
    1654             :         psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
    1655             :         psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
    1656             :         psWarpOptions->nBandCount = 1;
    1657             :         int nSrcBandNumber = (hVRTDS ? 1 : nBand);
    1658             :         int nDstBandNumber = 1;
    1659             :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1660             :         psWarpOptions->panDstBands = &nDstBandNumber;
    1661             :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
    1662             :                     psExtraArg->pfnProgress : GDALDummyProgress;
    1663             :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1664             :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1665             :         GDALRasterIOTransformerStruct sTransformer;
    1666             :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1667             :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1668             :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1669             :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1670             :         psWarpOptions->pTransformerArg = &sTransformer;
    1671             : 
    1672             :         GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
    1673             :         eErr = GDALChunkAndWarpImage( hWarpOperation,
    1674             :                                       nDestXOffVirtual, nDestYOffVirtual,
    1675             :                                       nBufXSize, nBufYSize );
    1676             :         GDALDestroyWarpOperation( hWarpOperation );
    1677             : 
    1678             :         psWarpOptions->panSrcBands = nullptr;
    1679             :         psWarpOptions->panDstBands = nullptr;
    1680             :         GDALDestroyWarpOptions( psWarpOptions );
    1681             : 
    1682             :         if( hVRTDS )
    1683             :             GDALClose(hVRTDS);
    1684             :     }
    1685             :     else
    1686             : #endif
    1687             :     {
    1688         856 :         const char *pszResampling =
    1689        1597 :             (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
    1690         741 :             : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
    1691           0 :             : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
    1692           0 :             : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
    1693           0 :             : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
    1694           0 :             : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
    1695           0 :             : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
    1696           0 :             : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
    1697             :                                                                : "UNKNOWN";
    1698             : 
    1699         856 :         GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
    1700         862 :         GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
    1701             :         int nBlockXSize, nBlockYSize;
    1702         855 :         poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    1703             : 
    1704             :         int nKernelRadius;
    1705             :         GDALResampleFunction pfnResampleFunc =
    1706         853 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1707         858 :         CPLAssert(pfnResampleFunc);
    1708             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1709             :         GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
    1710             :             GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
    1711             : #endif
    1712             :         GDALDataType eWrkDataType =
    1713         858 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1714             : 
    1715         856 :         int nDstBlockXSize = nBufXSize;
    1716         856 :         int nDstBlockYSize = nBufYSize;
    1717             :         int nFullResXChunk, nFullResYChunk;
    1718             :         while (true)
    1719             :         {
    1720         856 :             nFullResXChunk =
    1721         856 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1722         856 :             nFullResYChunk =
    1723         856 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1724         856 :             if (nFullResXChunk > nRasterXSize)
    1725         573 :                 nFullResXChunk = nRasterXSize;
    1726         856 :             if (nFullResYChunk > nRasterYSize)
    1727          50 :                 nFullResYChunk = nRasterYSize;
    1728         856 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1729         854 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1730             :                  1024 * 1024))
    1731             :                 break;
    1732             :             // When operating on the full width of a raster whose block width is
    1733             :             // the raster width, prefer doing chunks in height.
    1734           0 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1735             :                 nDstBlockYSize > 1)
    1736           0 :                 nDstBlockYSize /= 2;
    1737             :             /* Otherwise cut the maximal dimension */
    1738           0 :             else if (nDstBlockXSize > 1 &&
    1739           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1740           0 :                 nDstBlockXSize /= 2;
    1741             :             else
    1742           0 :                 nDstBlockYSize /= 2;
    1743             :         }
    1744             : 
    1745        1716 :         int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    1746         856 :                                   static_cast<int>(0.5 + dfYRatioDstToSrc));
    1747         860 :         if (nOvrFactor == 0)
    1748          99 :             nOvrFactor = 1;
    1749         860 :         int nFullResXSizeQueried =
    1750         860 :             nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    1751         860 :         int nFullResYSizeQueried =
    1752         860 :             nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
    1753             : 
    1754         860 :         if (nFullResXSizeQueried > nRasterXSize)
    1755         598 :             nFullResXSizeQueried = nRasterXSize;
    1756         860 :         if (nFullResYSizeQueried > nRasterYSize)
    1757          53 :             nFullResYSizeQueried = nRasterYSize;
    1758             : 
    1759         860 :         void *pChunk = VSI_MALLOC3_VERBOSE(
    1760             :             cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
    1761             :                               nBandCount),
    1762             :             nFullResXSizeQueried, nFullResYSizeQueried);
    1763         864 :         GByte *pabyChunkNoDataMask = nullptr;
    1764             : 
    1765         864 :         GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
    1766         865 :         int nMaskFlags = poFirstSrcBand->GetMaskFlags();
    1767             : 
    1768         861 :         bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
    1769         861 :         if (bUseNoDataMask)
    1770             :         {
    1771         596 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1772             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1773             :         }
    1774         861 :         if (pChunk == nullptr ||
    1775         596 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1776             :         {
    1777           3 :             GDALClose(poMEMDS);
    1778           0 :             CPLFree(pChunk);
    1779           0 :             CPLFree(pabyChunkNoDataMask);
    1780           0 :             CPLFree(papoDstBands);
    1781           0 :             return CE_Failure;
    1782             :         }
    1783             : 
    1784         858 :         const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
    1785         858 :                                  DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
    1786         858 :         int nBlocksDone = 0;
    1787             : 
    1788             :         int nDstYOff;
    1789        1727 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1790         869 :              nDstYOff += nDstBlockYSize)
    1791             :         {
    1792             :             int nDstYCount;
    1793         856 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1794         857 :                 nDstYCount = nDstBlockYSize;
    1795             :             else
    1796           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1797             : 
    1798         856 :             int nChunkYOff =
    1799         856 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1800         856 :             int nChunkYOff2 = nYOff + 1 +
    1801         856 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1802             :                                                     dfYRatioDstToSrc));
    1803         856 :             if (nChunkYOff2 > nRasterYSize)
    1804         126 :                 nChunkYOff2 = nRasterYSize;
    1805         856 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1806         856 :             CPLAssert(nYCount <= nFullResYChunk);
    1807             : 
    1808         856 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    1809         856 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
    1810         856 :             if (nChunkYOffQueried < 0)
    1811             :             {
    1812         129 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1813         129 :                 nChunkYOffQueried = 0;
    1814             :             }
    1815         856 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1816         144 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1817         856 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1818             : 
    1819             :             int nDstXOff;
    1820        1722 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1821         866 :                  nDstXOff += nDstBlockXSize)
    1822             :             {
    1823             :                 int nDstXCount;
    1824         853 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1825         853 :                     nDstXCount = nDstBlockXSize;
    1826             :                 else
    1827           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1828             : 
    1829         853 :                 int nChunkXOff =
    1830         853 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1831         853 :                 int nChunkXOff2 =
    1832         853 :                     nXOff + 1 +
    1833         853 :                     static_cast<int>(
    1834         853 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1835         853 :                 if (nChunkXOff2 > nRasterXSize)
    1836         628 :                     nChunkXOff2 = nRasterXSize;
    1837         853 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1838         853 :                 CPLAssert(nXCount <= nFullResXChunk);
    1839             : 
    1840         853 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    1841         853 :                 int nChunkXSizeQueried =
    1842         853 :                     nXCount + 2 * nKernelRadius * nOvrFactor;
    1843         853 :                 if (nChunkXOffQueried < 0)
    1844             :                 {
    1845         623 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1846         623 :                     nChunkXOffQueried = 0;
    1847             :                 }
    1848         853 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1849         632 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1850         853 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1851             : 
    1852         853 :                 bool bSkipResample = false;
    1853         853 :                 bool bNoDataMaskFullyOpaque = false;
    1854         853 :                 if (eErr == CE_None && bUseNoDataMask)
    1855             :                 {
    1856         596 :                     eErr = poMaskBand->RasterIO(
    1857             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1858             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1859             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1860             :                         nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
    1861             : 
    1862             :                     /* Optimizations if mask is fully opaque or transparent */
    1863         596 :                     const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1864         596 :                     const GByte bVal = pabyChunkNoDataMask[0];
    1865         596 :                     int i = 1;  // Used after for.
    1866    40677800 :                     for (; i < nPixels; i++)
    1867             :                     {
    1868    40677300 :                         if (pabyChunkNoDataMask[i] != bVal)
    1869          72 :                             break;
    1870             :                     }
    1871         596 :                     if (i == nPixels)
    1872             :                     {
    1873         524 :                         if (bVal == 0)
    1874             :                         {
    1875         373 :                             GByte abyZero[16] = {0};
    1876         780 :                             for (int iBand = 0; iBand < nBandCount; iBand++)
    1877             :                             {
    1878        3499 :                                 for (int j = 0; j < nDstYCount; j++)
    1879             :                                 {
    1880        3092 :                                     GDALCopyWords64(
    1881             :                                         abyZero, GDT_Byte, 0,
    1882             :                                         static_cast<GByte *>(pData) +
    1883        3092 :                                             iBand * nBandSpace +
    1884        3092 :                                             nLineSpace * (j + nDstYOff) +
    1885        3092 :                                             nDstXOff * nPixelSpace,
    1886             :                                         eBufType, static_cast<int>(nPixelSpace),
    1887             :                                         nDstXCount);
    1888             :                                 }
    1889             :                             }
    1890         373 :                             bSkipResample = true;
    1891             :                         }
    1892             :                         else
    1893             :                         {
    1894         151 :                             bNoDataMaskFullyOpaque = true;
    1895             :                         }
    1896             :                     }
    1897             :                 }
    1898             : 
    1899         853 :                 if (!bSkipResample && eErr == CE_None)
    1900             :                 {
    1901             :                     /* Read the source buffers */
    1902         477 :                     eErr = RasterIO(
    1903             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1904             :                         nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1905             :                         nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
    1906             :                         nBandCount, panBandMap, 0, 0, 0, nullptr);
    1907             :                 }
    1908             : 
    1909             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1910             :                 if (pfnResampleFuncMultiBands && !bSkipResample &&
    1911             :                     eErr == CE_None)
    1912             :                 {
    1913             :                     eErr = pfnResampleFuncMultiBands(
    1914             :                         dfXRatioDstToSrc, dfYRatioDstToSrc,
    1915             :                         dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
    1916             :                         dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
    1917             :                         eWrkDataType, (GByte *)pChunk, nBandCount,
    1918             :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
    1919             :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
    1920             :                         nChunkXSizeQueried,
    1921             :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
    1922             :                         nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
    1923             :                         nDstXOff + nDestXOffVirtual + nDstXCount,
    1924             :                         nDstYOff + nDestYOffVirtual,
    1925             :                         nDstYOff + nDestYOffVirtual + nDstYCount, papoDstBands,
    1926             :                         pszResampling, FALSE /*bHasNoData*/,
    1927             :                         0.0 /* dfNoDataValue */, nullptr /* color table*/,
    1928             :                         eDataType);
    1929             :                 }
    1930             :                 else
    1931             : #endif
    1932             :                 {
    1933             :                     size_t nChunkBandOffset =
    1934         863 :                         static_cast<size_t>(nChunkXSizeQueried) *
    1935         863 :                         nChunkYSizeQueried *
    1936         863 :                         GDALGetDataTypeSizeBytes(eWrkDataType);
    1937        2376 :                     for (int i = 0;
    1938        2376 :                          i < nBandCount && !bSkipResample && eErr == CE_None;
    1939             :                          i++)
    1940             :                     {
    1941        1510 :                         const bool bPropagateNoData = false;
    1942        1510 :                         void *pDstBuffer = nullptr;
    1943        1510 :                         GDALDataType eDstBufferDataType = GDT_Unknown;
    1944             :                         GDALRasterBand *poMEMBand =
    1945        1510 :                             poMEMDS->GetRasterBand(i + 1);
    1946        1509 :                         GDALOverviewResampleArgs args;
    1947        1509 :                         args.eSrcDataType = eDataType;
    1948        1509 :                         args.eOvrDataType = poMEMBand->GetRasterDataType();
    1949        1512 :                         args.nOvrXSize = poMEMBand->GetXSize();
    1950        1510 :                         args.nOvrYSize = poMEMBand->GetYSize();
    1951        1510 :                         args.nOvrNBITS = nNBITS;
    1952        1510 :                         args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1953        1510 :                         args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1954        1510 :                         args.dfSrcXDelta =
    1955        1510 :                             dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1956        1510 :                         args.dfSrcYDelta =
    1957        1510 :                             dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1958        1510 :                         args.eWrkDataType = eWrkDataType;
    1959        1510 :                         args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
    1960        1510 :                                                        ? nullptr
    1961             :                                                        : pabyChunkNoDataMask;
    1962        1510 :                         args.nChunkXOff =
    1963        1510 :                             nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1964        1510 :                         args.nChunkXSize = nChunkXSizeQueried;
    1965        1510 :                         args.nChunkYOff =
    1966        1510 :                             nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1967        1510 :                         args.nChunkYSize = nChunkYSizeQueried;
    1968        1510 :                         args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1969        1510 :                         args.nDstXOff2 =
    1970        1510 :                             nDstXOff + nDestXOffVirtual + nDstXCount;
    1971        1510 :                         args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1972        1510 :                         args.nDstYOff2 =
    1973        1510 :                             nDstYOff + nDestYOffVirtual + nDstYCount;
    1974        1510 :                         args.pszResampling = pszResampling;
    1975        1510 :                         args.bHasNoData = false;
    1976        1510 :                         args.dfNoDataValue = 0.0;
    1977        1510 :                         args.poColorTable = nullptr;
    1978        1510 :                         args.bPropagateNoData = bPropagateNoData;
    1979             : 
    1980             :                         eErr =
    1981        3023 :                             pfnResampleFunc(args,
    1982        1510 :                                             reinterpret_cast<GByte *>(pChunk) +
    1983        1510 :                                                 i * nChunkBandOffset,
    1984             :                                             &pDstBuffer, &eDstBufferDataType);
    1985        1513 :                         if (eErr == CE_None)
    1986             :                         {
    1987        1513 :                             eErr = poMEMBand->RasterIO(
    1988             :                                 GF_Write, nDstXOff + nDestXOffVirtual,
    1989             :                                 nDstYOff + nDestYOffVirtual, nDstXCount,
    1990             :                                 nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
    1991             :                                 eDstBufferDataType, 0, 0, nullptr);
    1992             :                         }
    1993        1513 :                         CPLFree(pDstBuffer);
    1994             :                     }
    1995             :                 }
    1996             : 
    1997         866 :                 nBlocksDone++;
    1998        1255 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1999         389 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    2000             :                                              "", psExtraArg->pProgressData))
    2001             :                 {
    2002           0 :                     eErr = CE_Failure;
    2003             :                 }
    2004             :             }
    2005             :         }
    2006             : 
    2007         871 :         CPLFree(pChunk);
    2008         865 :         CPLFree(pabyChunkNoDataMask);
    2009             :     }
    2010             : 
    2011         865 :     CPLFree(papoDstBands);
    2012         865 :     GDALClose(poMEMDS);
    2013             : 
    2014         865 :     return eErr;
    2015             : }
    2016             : 
    2017             : //! @endcond
    2018             : 
    2019             : /************************************************************************/
    2020             : /*                           GDALSwapWords()                            */
    2021             : /************************************************************************/
    2022             : 
    2023             : /**
    2024             :  * Byte swap words in-place.
    2025             :  *
    2026             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2027             :  * a memory array.  No assumption is made that the words being swapped are
    2028             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2029             :  * to determine if the current platform is big endian or little endian.  Use
    2030             :  * The macros like CPL_SWAP32() to byte swap single values without the overhead
    2031             :  * of a function call.
    2032             :  *
    2033             :  * @param pData pointer to start of data buffer.
    2034             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2035             :  * @param nWordCount the number of words to be swapped in this call.
    2036             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2037             :  * the next. For packed buffers this is the same as nWordSize.
    2038             :  */
    2039             : 
    2040      497137 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
    2041             :                                int nWordSkip)
    2042             : 
    2043             : {
    2044      497137 :     if (nWordCount > 0)
    2045      497137 :         VALIDATE_POINTER0(pData, "GDALSwapWords");
    2046             : 
    2047      497137 :     GByte *pabyData = static_cast<GByte *>(pData);
    2048             : 
    2049      497137 :     switch (nWordSize)
    2050             :     {
    2051        7234 :         case 1:
    2052        7234 :             break;
    2053             : 
    2054      476903 :         case 2:
    2055      476903 :             CPLAssert(nWordSkip >= 2 || nWordCount == 1);
    2056   228062000 :             for (int i = 0; i < nWordCount; i++)
    2057             :             {
    2058   227585000 :                 CPL_SWAP16PTR(pabyData);
    2059   227585000 :                 pabyData += nWordSkip;
    2060             :             }
    2061      476903 :             break;
    2062             : 
    2063       10574 :         case 4:
    2064       10574 :             CPLAssert(nWordSkip >= 4 || nWordCount == 1);
    2065       10574 :             if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
    2066             :             {
    2067    29140500 :                 for (int i = 0; i < nWordCount; i++)
    2068             :                 {
    2069    29130000 :                     *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
    2070             :                         *reinterpret_cast<const GUInt32 *>(pabyData));
    2071    29130000 :                     pabyData += nWordSkip;
    2072       10571 :                 }
    2073             :             }
    2074             :             else
    2075             :             {
    2076           9 :                 for (int i = 0; i < nWordCount; i++)
    2077             :                 {
    2078           6 :                     CPL_SWAP32PTR(pabyData);
    2079           6 :                     pabyData += nWordSkip;
    2080             :                 }
    2081             :             }
    2082       10574 :             break;
    2083             : 
    2084        2426 :         case 8:
    2085        2426 :             CPLAssert(nWordSkip >= 8 || nWordCount == 1);
    2086        2426 :             if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
    2087             :             {
    2088     3356900 :                 for (int i = 0; i < nWordCount; i++)
    2089             :                 {
    2090     3354480 :                     *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
    2091             :                         *reinterpret_cast<const GUInt64 *>(pabyData));
    2092     3354480 :                     pabyData += nWordSkip;
    2093        2425 :                 }
    2094             :             }
    2095             :             else
    2096             :             {
    2097           3 :                 for (int i = 0; i < nWordCount; i++)
    2098             :                 {
    2099           2 :                     CPL_SWAP64PTR(pabyData);
    2100           2 :                     pabyData += nWordSkip;
    2101             :                 }
    2102             :             }
    2103        2426 :             break;
    2104             : 
    2105           0 :         default:
    2106           0 :             CPLAssert(false);
    2107             :     }
    2108             : }
    2109             : 
    2110             : /************************************************************************/
    2111             : /*                           GDALSwapWordsEx()                          */
    2112             : /************************************************************************/
    2113             : 
    2114             : /**
    2115             :  * Byte swap words in-place.
    2116             :  *
    2117             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2118             :  * a memory array.  No assumption is made that the words being swapped are
    2119             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2120             :  * to determine if the current platform is big endian or little endian.  Use
    2121             :  * The macros like CPL_SWAP32() to byte swap single values without the overhead
    2122             :  * of a function call.
    2123             :  *
    2124             :  * @param pData pointer to start of data buffer.
    2125             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2126             :  * @param nWordCount the number of words to be swapped in this call.
    2127             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2128             :  * the next. For packed buffers this is the same as nWordSize.
    2129             :  */
    2130        6118 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
    2131             :                                  int nWordSkip)
    2132             : {
    2133        6118 :     GByte *pabyData = static_cast<GByte *>(pData);
    2134       12236 :     while (nWordCount)
    2135             :     {
    2136             :         // Pick-up a multiple of 8 as max chunk size.
    2137        6118 :         const int nWordCountSmall =
    2138        6118 :             (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
    2139        6118 :         GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
    2140        6118 :         pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
    2141        6118 :         nWordCount -= nWordCountSmall;
    2142             :     }
    2143        6118 : }
    2144             : 
    2145             : // Place the new GDALCopyWords helpers in an anonymous namespace
    2146             : namespace
    2147             : {
    2148             : 
    2149             : /************************************************************************/
    2150             : /*                           GDALCopyWordsT()                           */
    2151             : /************************************************************************/
    2152             : /**
    2153             :  * Template function, used to copy data from pSrcData into buffer
    2154             :  * pDstData, with stride nSrcPixelStride in the source data and
    2155             :  * stride nDstPixelStride in the destination data. This template can
    2156             :  * deal with the case where the input data type is real or complex and
    2157             :  * the output is real.
    2158             :  *
    2159             :  * @param pSrcData the source data buffer
    2160             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2161             :  *                      of interest.
    2162             :  * @param pDstData the destination buffer.
    2163             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2164             :  *                      interest.
    2165             :  * @param nWordCount the total number of pixel words to copy
    2166             :  *
    2167             :  * @code
    2168             :  * // Assume an input buffer of type GUInt16 named pBufferIn
    2169             :  * GByte *pBufferOut = new GByte[numBytesOut];
    2170             :  * GDALCopyWordsT<GUInt16, GByte>(pSrcData, 2, pDstData, 1, numBytesOut);
    2171             :  * @endcode
    2172             :  * @note
    2173             :  * This is a private function, and should not be exposed outside of
    2174             :  * rasterio.cpp. External users should call the GDALCopyWords driver function.
    2175             :  */
    2176             : 
    2177             : template <class Tin, class Tout>
    2178    42360542 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
    2179             :                                          int nSrcPixelStride,
    2180             :                                          Tout *const CPL_RESTRICT pDstData,
    2181             :                                          int nDstPixelStride,
    2182             :                                          GPtrDiff_t nWordCount)
    2183             : {
    2184    42360542 :     decltype(nWordCount) nDstOffset = 0;
    2185             : 
    2186    42360542 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2187    42360542 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2188   518940496 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2189             :     {
    2190   476583393 :         const Tin tValue =
    2191   476583393 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2192   476583393 :         Tout *const pOutPixel =
    2193   476583393 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2194             : 
    2195   476583393 :         GDALCopyWord(tValue, *pOutPixel);
    2196             : 
    2197   476579793 :         nDstOffset += nDstPixelStride;
    2198             :     }
    2199    42356984 : }
    2200             : 
    2201             : template <class Tin, class Tout>
    2202    29698002 : static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
    2203             :                                         int nSrcPixelStride,
    2204             :                                         Tout *const CPL_RESTRICT pDstData,
    2205             :                                         int nDstPixelStride,
    2206             :                                         GPtrDiff_t nWordCount)
    2207             : {
    2208    29698002 :     GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
    2209             :                           nWordCount);
    2210    29698002 : }
    2211             : 
    2212             : template <class Tin, class Tout>
    2213     5011188 : static void inline GDALCopyWordsT_8atatime(
    2214             :     const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
    2215             :     Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
    2216             :     GPtrDiff_t nWordCount)
    2217             : {
    2218     5011188 :     decltype(nWordCount) nDstOffset = 0;
    2219             : 
    2220     5011188 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2221     5011188 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2222     5011188 :     decltype(nWordCount) n = 0;
    2223     5011188 :     if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
    2224             :         nDstPixelStride == static_cast<int>(sizeof(Tout)))
    2225             :     {
    2226    37031656 :         for (; n < nWordCount - 7; n += 8)
    2227             :         {
    2228    36552910 :             const Tin *pInValues = reinterpret_cast<const Tin *>(
    2229    36552910 :                 pSrcDataPtr + (n * nSrcPixelStride));
    2230    36552910 :             Tout *const pOutPixels =
    2231    36552910 :                 reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2232             : 
    2233    36552910 :             GDALCopy8Words(pInValues, pOutPixels);
    2234             : 
    2235    36550218 :             nDstOffset += 8 * nDstPixelStride;
    2236             :         }
    2237             :     }
    2238    10353591 :     for (; n < nWordCount; n++)
    2239             :     {
    2240     5342316 :         const Tin tValue =
    2241     5342316 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2242     5342316 :         Tout *const pOutPixel =
    2243     5342316 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2244             : 
    2245     5342316 :         GDALCopyWord(tValue, *pOutPixel);
    2246             : 
    2247     5345132 :         nDstOffset += nDstPixelStride;
    2248             :     }
    2249     5011285 : }
    2250             : 
    2251             : #ifdef HAVE_SSE2
    2252             : 
    2253             : template <class Tout>
    2254       39704 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
    2255             :                               int nSrcPixelStride,
    2256             :                               Tout *const CPL_RESTRICT pDstData,
    2257             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2258             : {
    2259             :     static_assert(std::is_integral<Tout>::value &&
    2260             :                       sizeof(Tout) == sizeof(uint16_t),
    2261             :                   "Bad Tout");
    2262       39704 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2263             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2264             :     {
    2265       33353 :         decltype(nWordCount) n = 0;
    2266       33353 :         const __m128i xmm_zero = _mm_setzero_si128();
    2267       33353 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2268             :             reinterpret_cast<GByte *>(pDstData);
    2269     1415717 :         for (; n < nWordCount - 15; n += 16)
    2270             :         {
    2271     1382364 :             __m128i xmm = _mm_loadu_si128(
    2272     1382364 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2273     1382364 :             __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
    2274     1382364 :             __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
    2275             :             _mm_storeu_si128(
    2276     1382364 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
    2277             :             _mm_storeu_si128(
    2278     1382364 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
    2279             :         }
    2280      109343 :         for (; n < nWordCount; n++)
    2281             :         {
    2282       75990 :             pDstData[n] = pSrcData[n];
    2283       33353 :         }
    2284             :     }
    2285             :     else
    2286             :     {
    2287        6351 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2288             :                               nDstPixelStride, nWordCount);
    2289             :     }
    2290       39704 : }
    2291             : 
    2292             : template <>
    2293       26970 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2294             :                                  int nSrcPixelStride,
    2295             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    2296             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2297             : {
    2298       26970 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2299             :                              nDstPixelStride, nWordCount);
    2300       26970 : }
    2301             : 
    2302             : template <>
    2303       12734 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2304             :                                  int nSrcPixelStride,
    2305             :                                  GInt16 *const CPL_RESTRICT pDstData,
    2306             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2307             : {
    2308       12734 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2309             :                              nDstPixelStride, nWordCount);
    2310       12734 : }
    2311             : 
    2312             : template <class Tout>
    2313    12834865 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
    2314             :                               int nSrcPixelStride,
    2315             :                               Tout *const CPL_RESTRICT pDstData,
    2316             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2317             : {
    2318             :     static_assert(std::is_integral<Tout>::value &&
    2319             :                       sizeof(Tout) == sizeof(uint32_t),
    2320             :                   "Bad Tout");
    2321    12834865 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2322             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2323             :     {
    2324     6278995 :         decltype(nWordCount) n = 0;
    2325     6278995 :         const __m128i xmm_zero = _mm_setzero_si128();
    2326     6278995 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2327             :             reinterpret_cast<GByte *>(pDstData);
    2328    69808200 :         for (; n < nWordCount - 15; n += 16)
    2329             :         {
    2330    63459645 :             __m128i xmm = _mm_loadu_si128(
    2331    63459645 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2332    63554645 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2333    63614245 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2334    63498745 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2335    63528145 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2336    63373345 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2337    63529245 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2338             :             _mm_storeu_si128(
    2339    63529245 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
    2340             :             _mm_storeu_si128(
    2341    63529245 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
    2342             :             _mm_storeu_si128(
    2343    63529245 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
    2344             :             _mm_storeu_si128(
    2345    63529245 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
    2346             :         }
    2347    14539179 :         for (; n < nWordCount; n++)
    2348             :         {
    2349     8190544 :             pDstData[n] = pSrcData[n];
    2350     6348615 :         }
    2351             :     }
    2352             :     else
    2353             :     {
    2354     6555920 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2355             :                               nDstPixelStride, nWordCount);
    2356             :     }
    2357    12899765 : }
    2358             : 
    2359             : template <>
    2360         465 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2361             :                                  int nSrcPixelStride,
    2362             :                                  GUInt32 *const CPL_RESTRICT pDstData,
    2363             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2364             : {
    2365         465 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2366             :                              nDstPixelStride, nWordCount);
    2367         465 : }
    2368             : 
    2369             : template <>
    2370    12834600 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2371             :                                  int nSrcPixelStride,
    2372             :                                  GInt32 *const CPL_RESTRICT pDstData,
    2373             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2374             : {
    2375    12834600 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2376             :                              nDstPixelStride, nWordCount);
    2377    12831100 : }
    2378             : 
    2379             : template <>
    2380     2475750 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2381             :                                  int nSrcPixelStride,
    2382             :                                  float *const CPL_RESTRICT pDstData,
    2383             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2384             : {
    2385     2475750 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2386             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2387             :     {
    2388      115070 :         decltype(nWordCount) n = 0;
    2389      115070 :         const __m128i xmm_zero = _mm_setzero_si128();
    2390      115070 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2391             :             reinterpret_cast<GByte *>(pDstData);
    2392     3323680 :         for (; n < nWordCount - 15; n += 16)
    2393             :         {
    2394     3208610 :             __m128i xmm = _mm_loadu_si128(
    2395     3208610 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2396     3208610 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2397     3208610 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2398     3208610 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2399     3208610 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2400     3208610 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2401     3208610 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2402     3208610 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2403     3208610 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2404     3208610 :             __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
    2405     3208610 :             __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
    2406     3208610 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2407             :                           xmm0_f);
    2408             :             _mm_storeu_ps(
    2409     3208610 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2410             :             _mm_storeu_ps(
    2411     3208610 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
    2412             :             _mm_storeu_ps(
    2413     3208610 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
    2414             :         }
    2415      501780 :         for (; n < nWordCount; n++)
    2416             :         {
    2417      386710 :             pDstData[n] = pSrcData[n];
    2418      115070 :         }
    2419             :     }
    2420             :     else
    2421             :     {
    2422     2360680 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2423             :                               nDstPixelStride, nWordCount);
    2424             :     }
    2425     2475750 : }
    2426             : 
    2427             : template <>
    2428      152377 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2429             :                                  int nSrcPixelStride,
    2430             :                                  double *const CPL_RESTRICT pDstData,
    2431             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2432             : {
    2433      152377 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2434             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2435             :     {
    2436      129021 :         decltype(nWordCount) n = 0;
    2437      129021 :         const __m128i xmm_zero = _mm_setzero_si128();
    2438      129021 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2439             :             reinterpret_cast<GByte *>(pDstData);
    2440     1431460 :         for (; n < nWordCount - 15; n += 16)
    2441             :         {
    2442     1302440 :             __m128i xmm = _mm_loadu_si128(
    2443     1302440 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2444     1302440 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2445     1302440 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2446     1302440 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2447     1302440 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2448     1302440 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2449     1302440 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2450             : 
    2451             : #if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
    2452             :             _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2453             :                              _mm256_cvtepi32_pd(xmm0));
    2454             :             _mm256_storeu_pd(
    2455             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2456             :                 _mm256_cvtepi32_pd(xmm1));
    2457             :             _mm256_storeu_pd(
    2458             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2459             :                 _mm256_cvtepi32_pd(xmm2));
    2460             :             _mm256_storeu_pd(
    2461             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2462             :                 _mm256_cvtepi32_pd(xmm3));
    2463             : #else
    2464     1302440 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2465     1302440 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2466     1302440 :             __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
    2467     1302440 :             __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
    2468     1302440 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2469     1302440 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2470     1302440 :             xmm2 = _mm_srli_si128(xmm2, 8);
    2471     1302440 :             xmm3 = _mm_srli_si128(xmm3, 8);
    2472     1302440 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2473     1302440 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2474     1302440 :             __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
    2475     1302440 :             __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
    2476             : 
    2477     1302440 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2478             :                           xmm0_low_d);
    2479             :             _mm_storeu_pd(
    2480     1302440 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2481             :                 xmm0_high_d);
    2482             :             _mm_storeu_pd(
    2483     1302440 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2484             :                 xmm1_low_d);
    2485             :             _mm_storeu_pd(
    2486     1302440 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2487             :                 xmm1_high_d);
    2488             :             _mm_storeu_pd(
    2489     1302440 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2490             :                 xmm2_low_d);
    2491             :             _mm_storeu_pd(
    2492     1302440 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
    2493             :                 xmm2_high_d);
    2494             :             _mm_storeu_pd(
    2495     1302440 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2496             :                 xmm3_low_d);
    2497             :             _mm_storeu_pd(
    2498     1302440 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
    2499             :                 xmm3_high_d);
    2500             : #endif
    2501             :         }
    2502      252479 :         for (; n < nWordCount; n++)
    2503             :         {
    2504      123458 :             pDstData[n] = pSrcData[n];
    2505      129021 :         }
    2506             :     }
    2507             :     else
    2508             :     {
    2509       23356 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2510             :                               nDstPixelStride, nWordCount);
    2511             :     }
    2512      152377 : }
    2513             : 
    2514             : template <>
    2515         147 : CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
    2516             :                                  int nSrcPixelStride,
    2517             :                                  int8_t *const CPL_RESTRICT pDstData,
    2518             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2519             : {
    2520         147 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2521             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2522             :     {
    2523         141 :         decltype(nWordCount) n = 0;
    2524         141 :         const __m128i xmm_127 = _mm_set1_epi8(127);
    2525         145 :         for (; n < nWordCount - 31; n += 32)
    2526             :         {
    2527           8 :             __m128i xmm0 = _mm_loadu_si128(
    2528           4 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2529           4 :             __m128i xmm1 = _mm_loadu_si128(
    2530           4 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 16));
    2531           4 :             xmm0 = _mm_min_epu8(xmm0, xmm_127);
    2532           4 :             xmm1 = _mm_min_epu8(xmm1, xmm_127);
    2533           4 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2534           4 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
    2535             :                              xmm1);
    2536             :         }
    2537        2421 :         for (; n < nWordCount; n++)
    2538             :         {
    2539        2280 :             pDstData[n] =
    2540        2280 :                 pSrcData[n] >= 127 ? 127 : static_cast<int8_t>(pSrcData[n]);
    2541         141 :         }
    2542             :     }
    2543             :     else
    2544             :     {
    2545           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2546             :                               nDstPixelStride, nWordCount);
    2547             :     }
    2548         147 : }
    2549             : 
    2550             : template <>
    2551          82 : CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
    2552             :                                  int nSrcPixelStride,
    2553             :                                  uint8_t *const CPL_RESTRICT pDstData,
    2554             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2555             : {
    2556          82 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2557             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2558             :     {
    2559          56 :         decltype(nWordCount) n = 0;
    2560             : #if !(defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS))
    2561          56 :         const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
    2562             : #endif
    2563         117 :         for (; n < nWordCount - 31; n += 32)
    2564             :         {
    2565         122 :             __m128i xmm0 = _mm_loadu_si128(
    2566          61 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2567          61 :             __m128i xmm1 = _mm_loadu_si128(
    2568          61 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 16));
    2569             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2570             :             xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
    2571             :             xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
    2572             : #else
    2573          61 :             xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
    2574          61 :             xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
    2575          61 :             xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
    2576          61 :             xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
    2577          61 :             xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
    2578          61 :             xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
    2579             : #endif
    2580          61 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2581          61 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
    2582             :                              xmm1);
    2583             :         }
    2584         352 :         for (; n < nWordCount; n++)
    2585             :         {
    2586         296 :             pDstData[n] =
    2587         296 :                 pSrcData[n] < 0 ? 0 : static_cast<uint8_t>(pSrcData[n]);
    2588          56 :         }
    2589             :     }
    2590             :     else
    2591             :     {
    2592          26 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2593             :                               nDstPixelStride, nWordCount);
    2594             :     }
    2595          82 : }
    2596             : 
    2597             : template <>
    2598        6037 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2599             :                                  int nSrcPixelStride,
    2600             :                                  uint8_t *const CPL_RESTRICT pDstData,
    2601             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2602             : {
    2603        6037 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2604             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2605             :     {
    2606        5062 :         decltype(nWordCount) n = 0;
    2607             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2608             :         const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
    2609             : #else
    2610             :         // In SSE2, min_epu16 does not exist, so shift from
    2611             :         // UInt16 to SInt16 to be able to use min_epi16
    2612        5062 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2613        5062 :         const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
    2614             : #endif
    2615       71888 :         for (; n < nWordCount - 15; n += 16)
    2616             :         {
    2617      133652 :             __m128i xmm0 = _mm_loadu_si128(
    2618       66826 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2619       66826 :             __m128i xmm1 = _mm_loadu_si128(
    2620       66826 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2621             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2622             :             xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
    2623             :             xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
    2624             : #else
    2625       66826 :             xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
    2626       66826 :             xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
    2627       66826 :             xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
    2628       66826 :             xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
    2629       66826 :             xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
    2630       66826 :             xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
    2631             : #endif
    2632       66826 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    2633       66826 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2634             :         }
    2635       16403 :         for (; n < nWordCount; n++)
    2636             :         {
    2637       11341 :             pDstData[n] =
    2638       11341 :                 pSrcData[n] >= 255 ? 255 : static_cast<uint8_t>(pSrcData[n]);
    2639        5062 :         }
    2640             :     }
    2641             :     else
    2642             :     {
    2643         975 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2644             :                               nDstPixelStride, nWordCount);
    2645             :     }
    2646        6037 : }
    2647             : 
    2648             : template <>
    2649          46 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2650             :                                  int nSrcPixelStride,
    2651             :                                  int16_t *const CPL_RESTRICT pDstData,
    2652             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2653             : {
    2654          46 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2655             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2656             :     {
    2657          40 :         decltype(nWordCount) n = 0;
    2658             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2659             :         const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
    2660             : #else
    2661             :         // In SSE2, min_epu16 does not exist, so shift from
    2662             :         // UInt16 to SInt16 to be able to use min_epi16
    2663          40 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2664          40 :         const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
    2665             : #endif
    2666         169 :         for (; n < nWordCount - 15; n += 16)
    2667             :         {
    2668         258 :             __m128i xmm0 = _mm_loadu_si128(
    2669         129 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2670         129 :             __m128i xmm1 = _mm_loadu_si128(
    2671         129 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2672             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2673             :             xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
    2674             :             xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
    2675             : #else
    2676         129 :             xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
    2677         129 :             xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
    2678         129 :             xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
    2679         129 :             xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
    2680         129 :             xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
    2681         129 :             xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
    2682             : #endif
    2683         129 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2684         129 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
    2685             :                              xmm1);
    2686             :         }
    2687         191 :         for (; n < nWordCount; n++)
    2688             :         {
    2689         282 :             pDstData[n] = pSrcData[n] >= 32767
    2690             :                               ? 32767
    2691         131 :                               : static_cast<int16_t>(pSrcData[n]);
    2692          40 :         }
    2693             :     }
    2694             :     else
    2695             :     {
    2696           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2697             :                               nDstPixelStride, nWordCount);
    2698             :     }
    2699          46 : }
    2700             : 
    2701             : template <>
    2702         135 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2703             :                                  int nSrcPixelStride,
    2704             :                                  uint16_t *const CPL_RESTRICT pDstData,
    2705             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2706             : {
    2707         135 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2708             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2709             :     {
    2710          92 :         decltype(nWordCount) n = 0;
    2711          92 :         const __m128i xmm_zero = _mm_setzero_si128();
    2712         277 :         for (; n < nWordCount - 15; n += 16)
    2713             :         {
    2714         370 :             __m128i xmm0 = _mm_loadu_si128(
    2715         185 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2716         185 :             __m128i xmm1 = _mm_loadu_si128(
    2717         185 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2718         185 :             xmm0 = _mm_max_epi16(xmm0, xmm_zero);
    2719         185 :             xmm1 = _mm_max_epi16(xmm1, xmm_zero);
    2720         185 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2721         185 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
    2722             :                              xmm1);
    2723             :         }
    2724         468 :         for (; n < nWordCount; n++)
    2725             :         {
    2726         376 :             pDstData[n] =
    2727         376 :                 pSrcData[n] < 0 ? 0 : static_cast<uint16_t>(pSrcData[n]);
    2728          92 :         }
    2729             :     }
    2730             :     else
    2731             :     {
    2732          43 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2733             :                               nDstPixelStride, nWordCount);
    2734             :     }
    2735         135 : }
    2736             : 
    2737             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2738             : 
    2739             : template <>
    2740             : CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
    2741             :                                  int nSrcPixelStride,
    2742             :                                  int32_t *const CPL_RESTRICT pDstData,
    2743             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2744             : {
    2745             :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2746             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2747             :     {
    2748             :         decltype(nWordCount) n = 0;
    2749             :         const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
    2750             :         for (; n < nWordCount - 8; n += 7)
    2751             :         {
    2752             :             __m128i xmm0 = _mm_loadu_si128(
    2753             :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2754             :             __m128i xmm1 = _mm_loadu_si128(
    2755             :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    2756             :             xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
    2757             :             xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
    2758             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2759             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
    2760             :                              xmm1);
    2761             :         }
    2762             :         for (; n < nWordCount; n++)
    2763             :         {
    2764             :             pDstData[n] = pSrcData[n] >= INT_MAX
    2765             :                               ? INT_MAX
    2766             :                               : static_cast<int32_t>(pSrcData[n]);
    2767             :         }
    2768             :     }
    2769             :     else
    2770             :     {
    2771             :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2772             :                               nDstPixelStride, nWordCount);
    2773             :     }
    2774             : }
    2775             : 
    2776             : template <>
    2777             : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    2778             :                                  int nSrcPixelStride,
    2779             :                                  uint32_t *const CPL_RESTRICT pDstData,
    2780             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2781             : {
    2782             :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2783             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2784             :     {
    2785             :         decltype(nWordCount) n = 0;
    2786             :         const __m128i xmm_zero = _mm_setzero_si128();
    2787             :         for (; n < nWordCount - 7; n += 8)
    2788             :         {
    2789             :             __m128i xmm0 = _mm_loadu_si128(
    2790             :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2791             :             __m128i xmm1 = _mm_loadu_si128(
    2792             :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    2793             :             xmm0 = _mm_max_epi32(xmm0, xmm_zero);
    2794             :             xmm1 = _mm_max_epi32(xmm1, xmm_zero);
    2795             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2796             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
    2797             :                              xmm1);
    2798             :         }
    2799             :         for (; n < nWordCount; n++)
    2800             :         {
    2801             :             pDstData[n] =
    2802             :                 pSrcData[n] < 0 ? 0 : static_cast<uint32_t>(pSrcData[n]);
    2803             :         }
    2804             :     }
    2805             :     else
    2806             :     {
    2807             :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2808             :                               nDstPixelStride, nWordCount);
    2809             :     }
    2810             : }
    2811             : 
    2812             : #endif  // defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2813             : 
    2814             : template <>
    2815         339 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2816             :                                  int nSrcPixelStride,
    2817             :                                  float *const CPL_RESTRICT pDstData,
    2818             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2819             : {
    2820         339 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2821             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2822             :     {
    2823         333 :         decltype(nWordCount) n = 0;
    2824         333 :         const __m128i xmm_zero = _mm_setzero_si128();
    2825         333 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2826             :             reinterpret_cast<GByte *>(pDstData);
    2827        1472 :         for (; n < nWordCount - 7; n += 8)
    2828             :         {
    2829        1139 :             __m128i xmm = _mm_loadu_si128(
    2830        1139 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2831        1139 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2832        1139 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2833        1139 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2834        1139 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2835        1139 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2836             :                           xmm0_f);
    2837             :             _mm_storeu_ps(
    2838        1139 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2839             :         }
    2840        1099 :         for (; n < nWordCount; n++)
    2841             :         {
    2842         766 :             pDstData[n] = pSrcData[n];
    2843         333 :         }
    2844             :     }
    2845             :     else
    2846             :     {
    2847           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2848             :                               nDstPixelStride, nWordCount);
    2849             :     }
    2850         339 : }
    2851             : 
    2852             : template <>
    2853     1073480 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2854             :                                  int nSrcPixelStride,
    2855             :                                  float *const CPL_RESTRICT pDstData,
    2856             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2857             : {
    2858     1073480 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2859             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2860             :     {
    2861       83576 :         decltype(nWordCount) n = 0;
    2862       83576 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2863             :             reinterpret_cast<GByte *>(pDstData);
    2864      565231 :         for (; n < nWordCount - 7; n += 8)
    2865             :         {
    2866      481655 :             __m128i xmm = _mm_loadu_si128(
    2867      481655 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2868      481655 :             const auto sign = _mm_srai_epi16(xmm, 15);
    2869      481655 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
    2870      481655 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
    2871      481655 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2872      481655 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2873      481655 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2874             :                           xmm0_f);
    2875             :             _mm_storeu_ps(
    2876      481655 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2877             :         }
    2878      244165 :         for (; n < nWordCount; n++)
    2879             :         {
    2880      160589 :             pDstData[n] = pSrcData[n];
    2881       83576 :         }
    2882             :     }
    2883             :     else
    2884             :     {
    2885      989901 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2886             :                               nDstPixelStride, nWordCount);
    2887             :     }
    2888     1073480 : }
    2889             : 
    2890             : template <>
    2891         381 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2892             :                                  int nSrcPixelStride,
    2893             :                                  double *const CPL_RESTRICT pDstData,
    2894             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2895             : {
    2896         381 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2897             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2898             :     {
    2899         269 :         decltype(nWordCount) n = 0;
    2900         269 :         const __m128i xmm_zero = _mm_setzero_si128();
    2901         269 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2902             :             reinterpret_cast<GByte *>(pDstData);
    2903         713 :         for (; n < nWordCount - 7; n += 8)
    2904             :         {
    2905         444 :             __m128i xmm = _mm_loadu_si128(
    2906         444 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2907         444 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2908         444 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2909             : 
    2910         444 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2911         444 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2912         444 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2913         444 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2914         444 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2915         444 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2916             : 
    2917         444 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2918             :                           xmm0_low_d);
    2919             :             _mm_storeu_pd(
    2920         444 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2921             :                 xmm0_high_d);
    2922             :             _mm_storeu_pd(
    2923         444 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2924             :                 xmm1_low_d);
    2925             :             _mm_storeu_pd(
    2926         444 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2927             :                 xmm1_high_d);
    2928             :         }
    2929         918 :         for (; n < nWordCount; n++)
    2930             :         {
    2931         649 :             pDstData[n] = pSrcData[n];
    2932         269 :         }
    2933             :     }
    2934             :     else
    2935             :     {
    2936         112 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2937             :                               nDstPixelStride, nWordCount);
    2938             :     }
    2939         381 : }
    2940             : 
    2941             : template <>
    2942     2760310 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2943             :                                  int nSrcPixelStride,
    2944             :                                  double *const CPL_RESTRICT pDstData,
    2945             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2946             : {
    2947     2760310 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2948             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2949             :     {
    2950       34617 :         decltype(nWordCount) n = 0;
    2951       34617 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2952             :             reinterpret_cast<GByte *>(pDstData);
    2953      401078 :         for (; n < nWordCount - 7; n += 8)
    2954             :         {
    2955      366670 :             __m128i xmm = _mm_loadu_si128(
    2956      366670 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2957      366487 :             const auto sign = _mm_srai_epi16(xmm, 15);
    2958      366477 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
    2959      366356 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
    2960             : 
    2961      366353 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2962      366187 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2963      366187 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2964      366285 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2965      366466 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2966      366461 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2967             : 
    2968      366461 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2969             :                           xmm0_low_d);
    2970             :             _mm_storeu_pd(
    2971      366461 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2972             :                 xmm0_high_d);
    2973             :             _mm_storeu_pd(
    2974      366461 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2975             :                 xmm1_low_d);
    2976             :             _mm_storeu_pd(
    2977      366461 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2978             :                 xmm1_high_d);
    2979             :         }
    2980      253040 :         for (; n < nWordCount; n++)
    2981             :         {
    2982      218632 :             pDstData[n] = pSrcData[n];
    2983       34408 :         }
    2984             :     }
    2985             :     else
    2986             :     {
    2987     2725700 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2988             :                               nDstPixelStride, nWordCount);
    2989             :     }
    2990     2760100 : }
    2991             : 
    2992             : template <>  // Specialization: double source words -> GByte destination words.
    2993     4420610 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    2994             :                                  int nSrcPixelStride,
    2995             :                                  GByte *const CPL_RESTRICT pDstData,
    2996             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2997             : {  // Forwards every argument unchanged to GDALCopyWordsT_8atatime().
    2998     4420610 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2999             :                             nDstPixelStride, nWordCount);
    3000     4420650 : }
    3001             : 
    3002             : template <>  // Specialization: double source words -> GUInt16 destination words.
    3003       38365 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3004             :                                  int nSrcPixelStride,
    3005             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    3006             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3007             : {  // Forwards every argument unchanged to GDALCopyWordsT_8atatime().
    3008       38365 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3009             :                             nDstPixelStride, nWordCount);
    3010       38365 : }
    3011             : 
    3012             : template <>  // Specialization: float source words -> double destination words.
    3013       54573 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3014             :                                  int nSrcPixelStride,
    3015             :                                  double *const CPL_RESTRICT pDstData,
    3016             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3017             : {  // Forwards every argument unchanged to GDALCopyWordsT_8atatime().
    3018       54573 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3019             :                             nDstPixelStride, nWordCount);
    3020       54573 : }
    3021             : 
    3022             : template <>  // Specialization: double source words -> float destination words.
    3023      122650 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3024             :                                  int nSrcPixelStride,
    3025             :                                  float *const CPL_RESTRICT pDstData,
    3026             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3027             : {  // Forwards every argument unchanged to GDALCopyWordsT_8atatime().
    3028      122650 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3029             :                             nDstPixelStride, nWordCount);
    3030      122652 : }
    3031             : 
    3032             : template <>  // Specialization: GFloat16 source words -> float destination words.
    3033         407 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
    3034             :                                  int nSrcPixelStride,
    3035             :                                  float *const CPL_RESTRICT pDstData,
    3036             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3037             : {  // Forwards every argument unchanged to GDALCopyWordsT_8atatime().
    3038         407 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3039             :                             nDstPixelStride, nWordCount);
    3040         407 : }
    3041             : 
    3042             : template <>  // Specialization: GFloat16 source words -> double destination words.
    3043         532 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
    3044             :                                  int nSrcPixelStride,
    3045             :                                  double *const CPL_RESTRICT pDstData,
    3046             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3047             : {  // Forwards every argument unchanged to GDALCopyWordsT_8atatime().
    3048         532 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3049             :                             nDstPixelStride, nWordCount);
    3050         532 : }
    3051             : 
    3052             : #ifdef __F16C__
    3053             : 
    3054             : template <>  // Specialization: float -> GFloat16; only compiled when __F16C__ is defined.
    3055             : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3056             :                                  int nSrcPixelStride,
    3057             :                                  GFloat16 *const CPL_RESTRICT pDstData,
    3058             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3059             : {  // Forwards every argument unchanged to GDALCopyWordsT_8atatime().
    3060             :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3061             :                             nDstPixelStride, nWordCount);
    3062             : }
    3063             : 
    3064             : template <>  // Specialization: double -> GFloat16; only compiled when __F16C__ is defined.
    3065             : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3066             :                                  int nSrcPixelStride,
    3067             :                                  GFloat16 *const CPL_RESTRICT pDstData,
    3068             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3069             : {  // Forwards every argument unchanged to GDALCopyWordsT_8atatime().
    3070             :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3071             :                             nDstPixelStride, nWordCount);
    3072             : }
    3073             : 
    3074             : #endif  // __F16C__
    3075             : 
    3076             : #endif  // HAVE_SSE2
    3077             : 
    3078             : template <>  // Specialization: float source words -> GByte destination words.
    3079      296608 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3080             :                                  int nSrcPixelStride,
    3081             :                                  GByte *const CPL_RESTRICT pDstData,
    3082             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3083             : {  // Forwards every argument unchanged to GDALCopyWordsT_8atatime().
    3084      296608 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3085             :                             nDstPixelStride, nWordCount);
    3086      296616 : }
    3087             : 
    3088             : template <>  // Specialization: float source words -> GInt16 destination words.
    3089       15775 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3090             :                                  int nSrcPixelStride,
    3091             :                                  GInt16 *const CPL_RESTRICT pDstData,
    3092             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3093             : {  // Forwards every argument unchanged to GDALCopyWordsT_8atatime().
    3094       15775 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3095             :                             nDstPixelStride, nWordCount);
    3096       15775 : }
    3097             : 
    3098             : template <>  // Specialization: float source words -> GUInt16 destination words.
    3099       61707 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3100             :                                  int nSrcPixelStride,
    3101             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    3102             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3103             : {  // Forwards every argument unchanged to GDALCopyWordsT_8atatime().
    3104       61707 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3105             :                             nDstPixelStride, nWordCount);
    3106       61707 : }
    3107             : 
    3108             : /************************************************************************/
    3109             : /*                   GDALCopyWordsComplexT()                            */
    3110             : /************************************************************************/
    3111             : /**
    3112             :  * Template function, used to copy data from pSrcData into buffer
    3113             :  * pDstData, with stride nSrcPixelStride in the source data and
    3114             :  * stride nDstPixelStride in the destination data. Deals with the
    3115             :  * complex case, where input is complex and output is complex.
    3116             :  *
    3117             :  * @param pSrcData the source data buffer
    3118             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    3119             :  *                      of interest, in bytes.
    3120             :  * @param pDstData the destination buffer.
    3121             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    3122             :  *                      interest, in bytes.
    3123             :  * @param nWordCount the total number of pixels to copy; each pixel is copied
    3124             :  *                      as two consecutive words, [0] and [1], via GDALCopyWord()
    3125             :  */
    3126             : template <class Tin, class Tout>
    3127       97363 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
    3128             :                                   int nSrcPixelStride,
    3129             :                                   Tout *const CPL_RESTRICT pDstData,
    3130             :                                   int nDstPixelStride, GPtrDiff_t nWordCount)
    3131             : {
    3132       97363 :     decltype(nWordCount) nDstOffset = 0;  // running byte offset into the destination
    3133       97363 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);  // byte view of the source
    3134       97363 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);  // byte view of the destination
    3135             : 
    3136     5506161 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3137             :     {
    3138     5408793 :         const Tin *const pPixelIn =
    3139     5408793 :             reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);  // n-th source pixel
    3140     5408793 :         Tout *const pPixelOut =
    3141     5408793 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);  // n-th destination pixel
    3142             : 
    3143     5408793 :         GDALCopyWord(pPixelIn[0], pPixelOut[0]);  // first word of the pixel
    3144     5408793 :         GDALCopyWord(pPixelIn[1], pPixelOut[1]);  // second word of the pixel
    3145             : 
    3146     5408793 :         nDstOffset += nDstPixelStride;
    3147             :     }
    3148       97363 : }
    3149             : 
    3150             : /************************************************************************/
    3151             : /*                   GDALCopyWordsComplexOutT()                         */
    3152             : /************************************************************************/
    3153             : /**
    3154             :  * Template function, used to copy data from pSrcData into buffer
    3155             :  * pDstData, with stride nSrcPixelStride in the source data and
    3156             :  * stride nDstPixelStride in the destination data. Deals with the
    3157             :  * case where the value is real coming in, but complex going out:
    3158             :  * the second word of every output pixel is set to zero.
    3159             :  * @param pSrcData the source data buffer
    3160             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    3161             :  *                      of interest, in bytes.
    3162             :  * @param pDstData the destination buffer.
    3163             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    3164             :  *                      interest, in bytes.
    3165             :  * @param nWordCount the number of source words to copy, which is also the
    3166             :  *                      number of two-word pixels written to pDstData
    3167             :  */
    3168             : template <class Tin, class Tout>
    3169        3877 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
    3170             :                                      int nSrcPixelStride,
    3171             :                                      Tout *const CPL_RESTRICT pDstData,
    3172             :                                      int nDstPixelStride, GPtrDiff_t nWordCount)
    3173             : {
    3174        3877 :     decltype(nWordCount) nDstOffset = 0;  // running byte offset into the destination
    3175             : 
    3176        3877 :     const Tout tOutZero = static_cast<Tout>(0);  // value stored in the second word of each output pixel
    3177             : 
    3178        3877 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);  // byte view of the source
    3179        3877 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);  // byte view of the destination
    3180             : 
    3181     1155414 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3182             :     {
    3183     1151537 :         const Tin tValue =
    3184     1151537 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);  // n-th source word
    3185     1151537 :         Tout *const pPixelOut =
    3186     1151537 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);  // n-th destination pixel
    3187     1151537 :         GDALCopyWord(tValue, *pPixelOut);  // converted source value goes into the first word
    3188             : 
    3189     1151537 :         pPixelOut[1] = tOutZero;  // second word is always zero
    3190             : 
    3191     1151537 :         nDstOffset += nDstPixelStride;
    3192             :     }
    3193        3877 : }
    3194             : 
    3195             : /************************************************************************/
    3196             : /*                           GDALCopyWordsFromT()                       */
    3197             : /************************************************************************/
    3198             : /**
    3199             :  * Template driver function. Given the input type T, call the GDALCopyWordsT,
    3200             :  * GDALCopyWordsComplexT or GDALCopyWordsComplexOutT template for the output
    3201             :  * type. You should never call this directly (call GDALCopyWords instead).
    3202             :  *
    3203             :  * @param pSrcData source data buffer
    3204             :  * @param nSrcPixelStride pixel stride in input buffer, in bytes
    3205             :  * @param bInComplex input is complex (only consulted for complex eDstType)
    3206             :  * @param pDstData destination data buffer
    3207             :  * @param eDstType destination data type
    3208             :  * @param nDstPixelStride pixel stride in output buffer, in bytes
    3209             :  * @param nWordCount number of pixel words to be copied
    3210             :  */
    3211             : template <class T>
    3212    54154740 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
    3213             :                                int nSrcPixelStride, bool bInComplex,
    3214             :                                void *CPL_RESTRICT pDstData,
    3215             :                                GDALDataType eDstType, int nDstPixelStride,
    3216             :                                GPtrDiff_t nWordCount)
    3217             : {
    3218    54154740 :     switch (eDstType)  // dispatch on the destination type; pDstData is cast to the matching C type
    3219             :     {
    3220     4762182 :         case GDT_Byte:
    3221     4762182 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3222             :                            static_cast<unsigned char *>(pDstData),
    3223             :                            nDstPixelStride, nWordCount);
    3224     4762255 :             break;
    3225         752 :         case GDT_Int8:
    3226         752 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3227             :                            static_cast<signed char *>(pDstData),
    3228             :                            nDstPixelStride, nWordCount);
    3229         752 :             break;
    3230      140764 :         case GDT_UInt16:
    3231      140764 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3232             :                            static_cast<unsigned short *>(pDstData),
    3233             :                            nDstPixelStride, nWordCount);
    3234      140765 :             break;
    3235     4162845 :         case GDT_Int16:
    3236     4162845 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3237             :                            static_cast<short *>(pDstData), nDstPixelStride,
    3238             :                            nWordCount);
    3239     4162845 :             break;
    3240       22239 :         case GDT_UInt32:
    3241       22239 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3242             :                            static_cast<unsigned int *>(pDstData),
    3243             :                            nDstPixelStride, nWordCount);
    3244       22239 :             break;
    3245    26047619 :         case GDT_Int32:
    3246    26047619 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3247             :                            static_cast<int *>(pDstData), nDstPixelStride,
    3248             :                            nWordCount);
    3249    26043922 :             break;
    3250         809 :         case GDT_UInt64:
    3251         809 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3252             :                            static_cast<std::uint64_t *>(pDstData),
    3253             :                            nDstPixelStride, nWordCount);
    3254         809 :             break;
    3255        5431 :         case GDT_Int64:
    3256        5431 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3257             :                            static_cast<std::int64_t *>(pDstData),
    3258             :                            nDstPixelStride, nWordCount);
    3259        5431 :             break;
    3260         974 :         case GDT_Float16:
    3261         974 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3262             :                            static_cast<GFloat16 *>(pDstData), nDstPixelStride,
    3263             :                            nWordCount);
    3264         974 :             break;
    3265     3705859 :         case GDT_Float32:
    3266     3705859 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3267             :                            static_cast<float *>(pDstData), nDstPixelStride,
    3268             :                            nWordCount);
    3269     3705861 :             break;
    3270    15203881 :         case GDT_Float64:
    3271    15203881 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3272             :                            static_cast<double *>(pDstData), nDstPixelStride,
    3273             :                            nWordCount);
    3274    15203921 :             break;
    3275       94123 :         case GDT_CInt16:  // complex destinations from here on: two words per output pixel
    3276       94123 :             if (bInComplex)
    3277             :             {
    3278       92870 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3279             :                                       static_cast<short *>(pDstData),
    3280             :                                       nDstPixelStride, nWordCount);
    3281             :             }
    3282             :             else  // input is not complex, so we need to promote to a complex
    3283             :                   // buffer
    3284             :             {
    3285        1253 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3286             :                                          static_cast<short *>(pDstData),
    3287             :                                          nDstPixelStride, nWordCount);
    3288             :             }
    3289       94123 :             break;
    3290        1040 :         case GDT_CInt32:
    3291        1040 :             if (bInComplex)
    3292             :             {
    3293         409 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3294             :                                       static_cast<int *>(pDstData),
    3295             :                                       nDstPixelStride, nWordCount);
    3296             :             }
    3297             :             else  // input is not complex, so we need to promote to a complex
    3298             :                   // buffer
    3299             :             {
    3300         631 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3301             :                                          static_cast<int *>(pDstData),
    3302             :                                          nDstPixelStride, nWordCount);
    3303             :             }
    3304        1040 :             break;
    3305         313 :         case GDT_CFloat16:
    3306         313 :             if (bInComplex)
    3307             :             {
    3308          48 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3309             :                                       static_cast<GFloat16 *>(pDstData),
    3310             :                                       nDstPixelStride, nWordCount);
    3311             :             }
    3312             :             else  // input is not complex, so we need to promote to a complex
    3313             :                   // buffer
    3314             :             {
    3315         265 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3316             :                                          static_cast<GFloat16 *>(pDstData),
    3317             :                                          nDstPixelStride, nWordCount);
    3318             :             }
    3319         313 :             break;
    3320        3473 :         case GDT_CFloat32:
    3321        3473 :             if (bInComplex)
    3322             :             {
    3323        2678 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3324             :                                       static_cast<float *>(pDstData),
    3325             :                                       nDstPixelStride, nWordCount);
    3326             :             }
    3327             :             else  // input is not complex, so we need to promote to a complex
    3328             :                   // buffer
    3329             :             {
    3330         795 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3331             :                                          static_cast<float *>(pDstData),
    3332             :                                          nDstPixelStride, nWordCount);
    3333             :             }
    3334        3473 :             break;
    3335        2291 :         case GDT_CFloat64:
    3336        2291 :             if (bInComplex)
    3337             :             {
    3338        1358 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3339             :                                       static_cast<double *>(pDstData),
    3340             :                                       nDstPixelStride, nWordCount);
    3341             :             }
    3342             :             else  // input is not complex, so we need to promote to a complex
    3343             :                   // buffer
    3344             :             {
    3345         933 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3346             :                                          static_cast<double *>(pDstData),
    3347             :                                          nDstPixelStride, nWordCount);
    3348             :             }
    3349        2291 :             break;
    3350           0 :         case GDT_Unknown:  // not copyable destination types: nothing is written, only an assertion
    3351             :         case GDT_TypeCount:
    3352           0 :             CPLAssert(false);
    3353             :     }
    3354    54151189 : }
    3355             : 
    3356             : }  // end anonymous namespace
    3357             : 
    3358             : /************************************************************************/
    3359             : /*                          GDALReplicateWord()                         */
    3360             : /************************************************************************/
    3361             : 
    3362             : template <class T>  // Copies the element already stored at pDstData into the nWordCount elements that follow it.
    3363      588569 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
    3364             :                                GPtrDiff_t nWordCount)
    3365             : {
    3366      588569 :     const T valSet = *static_cast<const T *>(pDstData);  // value to replicate: the first destination element
    3367      588569 :     if (nDstPixelStride == static_cast<int>(sizeof(T)))  // destination elements are contiguous
    3368             :     {
    3369      558942 :         T *pDstPtr = static_cast<T *>(pDstData) + 1;  // first element is already set; start at the second
    3370    21256073 :         while (nWordCount >= 4)  // manually unrolled: four stores per iteration
    3371             :         {
    3372    20697148 :             nWordCount -= 4;
    3373    20697148 :             pDstPtr[0] = valSet;
    3374    20697148 :             pDstPtr[1] = valSet;
    3375    20697148 :             pDstPtr[2] = valSet;
    3376    20697148 :             pDstPtr[3] = valSet;
    3377    20697148 :             pDstPtr += 4;
    3378             :         }
    3379     1460040 :         while (nWordCount > 0)  // remaining 0 to 3 elements
    3380             :         {
    3381      901098 :             --nWordCount;
    3382      901098 :             *pDstPtr = valSet;
    3383      901098 :             pDstPtr++;
    3384             :         }
    3385             :     }
    3386             :     else  // strided destination: step nDstPixelStride bytes between elements
    3387             :     {
    3388       29751 :         GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;  // second element
    3389     1040338 :         while (nWordCount > 0)
    3390             :         {
    3391     1010587 :             --nWordCount;
    3392     1010587 :             *reinterpret_cast<T *>(pabyDstPtr) = valSet;
    3393     1010587 :             pabyDstPtr += nDstPixelStride;
    3394             :         }
    3395             :     }
    3396      588569 : }
    3397             : 
    3398     1046560 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
    3399             :                               GDALDataType eSrcType,
    3400             :                               void *CPL_RESTRICT pDstData,
    3401             :                               GDALDataType eDstType, int nDstPixelStride,
    3402             :                               GPtrDiff_t nWordCount)
    3403             : {
    3404             :     /* -----------------------------------------------------------------------
    3405             :      */
    3406             :     /* Special case when the source data is always the same value */
    3407             :     /* (for VRTSourcedRasterBand::IRasterIO and
    3408             :      * VRTDerivedRasterBand::IRasterIO*/
    3409             :     /*  for example) */
    3410             :     /* -----------------------------------------------------------------------
    3411             :      */
    3412             :     // Let the general translation case do the necessary conversions
    3413             :     // on the first destination element.
    3414     1046560 :     GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);  // one element is always written
    3415             : 
    3416             :     // Now copy the first element to the nWordCount - 1 following destination
    3417             :     // elements.
    3418     1040420 :     nWordCount--;  // from here on: number of elements still to fill
    3419     1040420 :     GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;  // second destination element
    3420             : 
    3421     1040420 :     switch (eDstType)
    3422             :     {
    3423      451546 :         case GDT_Byte:
    3424             :         case GDT_Int8:
    3425             :         {
    3426      451546 :             if (nDstPixelStride == 1)  // contiguous bytes: a single memset fills the rest
    3427             :             {
    3428      380029 :                 if (nWordCount > 0)
    3429      380029 :                     memset(pabyDstWord,
    3430      380029 :                            *reinterpret_cast<const GByte *>(pDstData),
    3431             :                            nWordCount);
    3432             :             }
    3433             :             else  // strided bytes: store one byte every nDstPixelStride bytes
    3434             :             {
    3435       71517 :                 GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
    3436    50338700 :                 while (nWordCount > 0)
    3437             :                 {
    3438    50267200 :                     --nWordCount;
    3439    50267200 :                     *pabyDstWord = valSet;
    3440    50267200 :                     pabyDstWord += nDstPixelStride;
    3441             :                 }
    3442             :             }
    3443      451546 :             break;
    3444             :         }
    3445             : 
    3446             : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type)                               \
    3447             :     case enum_type:                                                            \
    3448             :     {                                                                          \
    3449             :         GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount);     \
    3450             :         break;                                                                 \
    3451             :     }
    3452             : 
    3453       34505 :             CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)  // single-word types: delegate to GDALReplicateWordT
    3454      202447 :             CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
    3455          56 :             CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
    3456      292878 :             CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
    3457          23 :             CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
    3458        1066 :             CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
    3459           0 :             CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
    3460       52662 :             CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
    3461        5224 :             CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
    3462             : 
    3463             : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type)                              \
    3464             :     case enum_type:                                                            \
    3465             :     {                                                                          \
    3466             :         c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0];        \
    3467             :         c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1];        \
    3468             :         while (nWordCount > 0)                                                 \
    3469             :         {                                                                      \
    3470             :             --nWordCount;                                                      \
    3471             :             reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1;              \
    3472             :             reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2;              \
    3473             :             pabyDstWord += nDstPixelStride;                                    \
    3474             :         }                                                                      \
    3475             :         break;                                                                 \
    3476             :     }
    3477             : 
    3478         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)  // two-word types: both words of the first pixel are replicated
    3479         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
    3480           6 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
    3481         790 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
    3482         790 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
    3483             : 
    3484           0 :         case GDT_Unknown:  // no replication is performed for these; only an assertion
    3485             :         case GDT_TypeCount:
    3486           0 :             CPLAssert(false);
    3487             :     }
    3488     1047770 : }
    3489             : 
    3490             : /************************************************************************/
    3491             : /*                        GDALUnrolledCopy()                            */
    3492             : /************************************************************************/
    3493             : 
                       // GDALUnrolledCopyGeneric(): portable strided element copy.
                       //
                       // Copies nIters elements of type T from pSrc to pDest. Consecutive source
                       // elements are srcStride elements apart and consecutive destination
                       // elements are dstStride elements apart; both strides are template
                       // constants counted in units of T (not bytes).
                       //
                       // The main loop is written out 16 elements at a time; the nIters % 16
                       // leftover elements (or all of them when nIters < 16) go through the
                       // scalar loop at the end.
    3494             : template <class T, int srcStride, int dstStride>
    3495     3019101 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
    3496             :                                            const T *CPL_RESTRICT pSrc,
    3497             :                                            GPtrDiff_t nIters)
    3498             : {
    3499     3019101 :     if (nIters >= 16)
    3500             :     {
                           // One pass per full group of 16 elements.
    3501   132918503 :         for (GPtrDiff_t i = nIters / 16; i != 0; i--)
    3502             :         {
    3503   130026053 :             pDest[0 * dstStride] = pSrc[0 * srcStride];
    3504   130026053 :             pDest[1 * dstStride] = pSrc[1 * srcStride];
    3505   130026053 :             pDest[2 * dstStride] = pSrc[2 * srcStride];
    3506   130026053 :             pDest[3 * dstStride] = pSrc[3 * srcStride];
    3507   130026053 :             pDest[4 * dstStride] = pSrc[4 * srcStride];
    3508   130026053 :             pDest[5 * dstStride] = pSrc[5 * srcStride];
    3509   130026053 :             pDest[6 * dstStride] = pSrc[6 * srcStride];
    3510   130026053 :             pDest[7 * dstStride] = pSrc[7 * srcStride];
    3511   130026053 :             pDest[8 * dstStride] = pSrc[8 * srcStride];
    3512   130026053 :             pDest[9 * dstStride] = pSrc[9 * srcStride];
    3513   130026053 :             pDest[10 * dstStride] = pSrc[10 * srcStride];
    3514   130026053 :             pDest[11 * dstStride] = pSrc[11 * srcStride];
    3515   130026053 :             pDest[12 * dstStride] = pSrc[12 * srcStride];
    3516   130026053 :             pDest[13 * dstStride] = pSrc[13 * srcStride];
    3517   130026053 :             pDest[14 * dstStride] = pSrc[14 * srcStride];
    3518   130026053 :             pDest[15 * dstStride] = pSrc[15 * srcStride];
    3519   130026053 :             pDest += 16 * dstStride;
    3520   130026053 :             pSrc += 16 * srcStride;
    3521             :         }
    3522     2892446 :         nIters = nIters % 16;
    3523             :     }
                       // Tail: pDest is indexed from its current position while pSrc is
                       // advanced one source stride per element.
    3524     5173893 :     for (GPtrDiff_t i = 0; i < nIters; i++)
    3525             :     {
    3526     2154803 :         pDest[i * dstStride] = *pSrc;
    3527     2154803 :         pSrc += srcStride;
    3528             :     }
    3529     3019101 : }
    3530             : 
                       // Primary GDALUnrolledCopy template: forwards to GDALUnrolledCopyGeneric
                       // with the same element type and strides. Explicit specializations can
                       // replace it for particular <T, srcStride, dstStride> combinations.
    3531             : template <class T, int srcStride, int dstStride>
    3532     3012991 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
    3533             :                                     const T *CPL_RESTRICT pSrc,
    3534             :                                     GPtrDiff_t nIters)
    3535             : {
    3536     3012991 :     GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
    3537     3013013 : }
    3538             : 
    3539             : #ifdef HAVE_SSE2
    3540             : 
                       // SSE2 specialization for bytes with a source stride of 2 and a packed
                       // destination, i.e. pDest[i] = pSrc[2 * i] for i in [0, nIters).
                       // The vector loop produces 16 output bytes per iteration from 32 input
                       // bytes; whatever is left is copied by the scalar loop at the end.
    3541             : template <>
    3542      353436 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
    3543             :                                    const GByte *CPL_RESTRICT pSrc,
    3544             :                                    GPtrDiff_t nIters)
    3545             : {
    3546      353436 :     decltype(nIters) i = 0;
    3547      353436 :     if (nIters > 16)
    3548             :     {
                           // 0x00ff in every 16-bit lane: keeps the first byte of each
                           // 2-byte source group and clears the second one.
    3549      195179 :         const __m128i xmm_mask = _mm_set1_epi16(0xff);
    3550             :         // If we were sure that there would always be 1 trailing byte, we could
    3551             :         // check against nIters - 15
                           // Each iteration loads pSrc[0..31] although the last byte it needs
                           // is pSrc[30]. Looping while i < nIters - 16 means at least 17
                           // elements are still pending, and their span covers pSrc[0..32],
                           // so both loads stay inside the source data.
    3552     2996300 :         for (; i < nIters - 16; i += 16)
    3553             :         {
    3554             :             __m128i xmm0 =
    3555     2801120 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3556             :             __m128i xmm1 =
    3557     5602250 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3558             :             // Set higher 8bit of each int16 packed word to 0
    3559     2801120 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3560     2801120 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3561             :             // Pack int16 to uint8 and merge back both vector
                               // (lanes are <= 0xff after masking, so the saturating pack
                               // does not alter any value)
    3562     2801120 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    3563             : 
    3564             :             // Store result
    3565     2801120 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3566             : 
    3567     2801120 :             pSrc += 2 * 16;
    3568             :         }
    3569             :     }
                       // Scalar copy of the remaining elements (all of them when nIters <= 16).
    3570     4628680 :     for (; i < nIters; i++)
    3571             :     {
    3572     4275240 :         pDest[i] = *pSrc;
    3573     4275240 :         pSrc += 2;
    3574             :     }
    3575      353436 : }
    3576             : 
    3577             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    3578             : 
                       // Specialization for bytes with a source stride of 3 and a packed
                       // destination. Dispatches at run time: when there are more than 16
                       // elements and CPLHaveRuntimeSSSE3() returns true, the work is handed to
                       // GDALUnrolledCopy_GByte_3_1_SSSE3() (defined outside this section);
                       // otherwise the generic unrolled copy is used.
                       // NOTE(review): the helper is presumably an SSSE3 implementation of the
                       // same copy, as its name suggests - confirm against its definition.
    3579             : template <>
    3580      192064 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3581             :                                    const GByte *CPL_RESTRICT pSrc,
    3582             :                                    GPtrDiff_t nIters)
    3583             : {
    3584      192064 :     if (nIters > 16 && CPLHaveRuntimeSSSE3())
    3585             :     {
    3586      185964 :         GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    3587             :     }
    3588             :     else
    3589             :     {
    3590        6100 :         GDALUnrolledCopyGeneric<GByte, 3, 1>(pDest, pSrc, nIters);
    3591             :     }
    3592      192064 : }
    3593             : 
    3594             : #endif
    3595             : 
                       // SSE2 specialization for bytes with a source stride of 4 and a packed
                       // destination, i.e. pDest[i] = pSrc[4 * i] for i in [0, nIters).
                       // The vector loop produces 16 output bytes per iteration from 64 input
                       // bytes; whatever is left is copied by the scalar loop at the end.
    3596             : template <>
    3597      106635 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
    3598             :                                    const GByte *CPL_RESTRICT pSrc,
    3599             :                                    GPtrDiff_t nIters)
    3600             : {
    3601      106635 :     decltype(nIters) i = 0;
    3602      106635 :     if (nIters > 16)
    3603             :     {
                           // 0x000000ff in every 32-bit lane: keeps the first byte of each
                           // 4-byte source group and clears the other three.
    3604      101342 :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    3605             :         // If we were sure that there would always be 3 trailing bytes, we could
    3606             :         // check against nIters - 15
                           // Each iteration loads pSrc[0..63] although the last byte it needs
                           // is pSrc[60]. Looping while i < nIters - 16 means at least 17
                           // elements are still pending, and their span covers pSrc[0..64],
                           // so all four loads stay inside the source data.
    3607    11322100 :         for (; i < nIters - 16; i += 16)
    3608             :         {
    3609             :             __m128i xmm0 =
    3610    11219400 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3611             :             __m128i xmm1 =
    3612    11219400 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3613             :             __m128i xmm2 =
    3614    11219400 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    3615             :             __m128i xmm3 =
    3616    22438800 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
    3617             :             // Set higher 24bit of each int32 packed word to 0
    3618    11219400 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3619    11219400 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3620    11219400 :             xmm2 = _mm_and_si128(xmm2, xmm_mask);
    3621    11219400 :             xmm3 = _mm_and_si128(xmm3, xmm_mask);
    3622             :             // Pack int32 to int16
                               // (lanes are <= 0xff after masking, so neither saturating
                               // pack below alters any value)
    3623    11219600 :             xmm0 = _mm_packs_epi32(xmm0, xmm1);
    3624    11219600 :             xmm2 = _mm_packs_epi32(xmm2, xmm3);
    3625             :             // Pack int16 to uint8
    3626    11220800 :             xmm0 = _mm_packus_epi16(xmm0, xmm2);
    3627             : 
    3628             :             // Store result
    3629    11220800 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3630             : 
    3631    11220800 :             pSrc += 4 * 16;
    3632             :         }
    3633             :     }
                       // Scalar copy of the remaining elements (all of them when nIters <= 16).
    3634     1143410 :     for (; i < nIters; i++)
    3635             :     {
    3636     1035440 :         pDest[i] = *pSrc;
    3637     1035440 :         pSrc += 4;
    3638             :     }
    3639      107966 : }
    3640             : #endif  // HAVE_SSE2
    3641             : 
    3642             : /************************************************************************/
    3643             : /*                         GDALFastCopy()                               */
    3644             : /************************************************************************/
    3645             : 
                       // Copies nIters elements of type T from pSrc to pDest, choosing a copy
                       // routine from the two strides.
                       //
                       // nDestStride and nSrcStride are byte strides. They are compared against
                       // 1x..4x sizeof(T) and, in the fallback loops, divided by sizeof(T) to
                       // step the typed pointers (integer division, so a stride that is not a
                       // multiple of sizeof(T) would be truncated).
                       //
                       // Dispatch:
                       //  - nIters == 1: single assignment.
                       //  - packed destination: memcpy() for a packed source,
                       //    GDALUnrolledCopy<T, N, 1> for a source stride of N = 2, 3 or 4
                       //    elements, scalar loop otherwise.
                       //  - packed source: GDALUnrolledCopy<T, 1, N> for a destination stride of
                       //    N = 2, 3 or 4 elements, scalar loop otherwise.
                       //  - anything else: scalar loop stepping both pointers.
    3646             : template <class T>
    3647    39713800 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
    3648             :                                 const T *CPL_RESTRICT pSrc, int nSrcStride,
    3649             :                                 GPtrDiff_t nIters)
    3650             : {
    3651    39713800 :     constexpr int sizeofT = static_cast<int>(sizeof(T));
    3652    39713800 :     if (nIters == 1)
    3653             :     {
    3654    22295020 :         *pDest = *pSrc;
    3655             :     }
                       // Destination elements are contiguous.
    3656    17418739 :     else if (nDestStride == sizeofT)
    3657             :     {
    3658    14334110 :         if (nSrcStride == sizeofT)
    3659             :         {
                               // Both sides contiguous: one block copy.
    3660    13477894 :             memcpy(pDest, pSrc, nIters * sizeof(T));
    3661             :         }
    3662      856236 :         else if (nSrcStride == 2 * sizeofT)
    3663             :         {
    3664      356651 :             GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
    3665             :         }
    3666      499585 :         else if (nSrcStride == 3 * sizeofT)
    3667             :         {
    3668      288642 :             GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
    3669             :         }
    3670      210943 :         else if (nSrcStride == 4 * sizeofT)
    3671             :         {
    3672      110617 :             GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
    3673             :         }
    3674             :         else
    3675             :         {
    3676    17216590 :             while (nIters-- > 0)
    3677             :             {
    3678    17116250 :                 *pDest = *pSrc;
    3679    17116250 :                 pSrc += nSrcStride / sizeofT;
    3680    17116250 :                 pDest++;
    3681             :             }
    3682             :         }
    3683             :     }
                       // Source elements are contiguous, destination is strided.
    3684     3084669 :     else if (nSrcStride == sizeofT)
    3685             :     {
    3686     3073032 :         if (nDestStride == 2 * sizeofT)
    3687             :         {
    3688      150067 :             GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
    3689             :         }
    3690     2922961 :         else if (nDestStride == 3 * sizeofT)
    3691             :         {
    3692     2096127 :             GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
    3693             :         }
    3694      826833 :         else if (nDestStride == 4 * sizeofT)
    3695             :         {
    3696      663021 :             GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
    3697             :         }
    3698             :         else
    3699             :         {
    3700    17119160 :             while (nIters-- > 0)
    3701             :             {
    3702    16955410 :                 *pDest = *pSrc;
    3703    16955410 :                 pSrc++;
    3704    16955410 :                 pDest += nDestStride / sizeofT;
    3705             :             }
    3706             :         }
    3707             :     }
                       // Both sides strided: plain element-by-element loop.
    3708             :     else
    3709             :     {
    3710     1218739 :         while (nIters-- > 0)
    3711             :         {
    3712     1207102 :             *pDest = *pSrc;
    3713     1207102 :             pSrc += nSrcStride / sizeofT;
    3714     1207102 :             pDest += nDestStride / sizeofT;
    3715             :         }
    3716             :     }
    3717    39713900 : }
    3718             : 
    3719             : /************************************************************************/
    3720             : /*                         GDALFastCopyByte()                           */
    3721             : /************************************************************************/
    3722             : 
                       // Byte-typed wrapper around GDALFastCopy(): copies nWordCount bytes from
                       // pSrcData to pDstData with the given byte strides.
                       // Note the argument order: source first here, destination first in
                       // GDALFastCopy().
    3723      326250 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
    3724             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
    3725             :                              int nDstPixelStride, GPtrDiff_t nWordCount)
    3726             : {
    3727      326250 :     GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
    3728             :                  nWordCount);
    3729      326250 : }
    3730             : 
    3731             : /************************************************************************/
    3732             : /*                           GDALCopyWords()                            */
    3733             : /************************************************************************/
    3734             : 
    3735             : /**
    3736             :  * Copy pixel words from buffer to buffer.
    3737             :  *
                        * Thin wrapper that forwards every argument unchanged to
                        * GDALCopyWords64(); the only difference is that the word count is an
                        * int here. See GDALCopyWords64() for the conversion rules and the
                        * meaning of each parameter.
                        *
    3738             :  * @see GDALCopyWords64()
    3739             :  */
    3740    78070700 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
    3741             :                                GDALDataType eSrcType, int nSrcPixelStride,
    3742             :                                void *CPL_RESTRICT pDstData,
    3743             :                                GDALDataType eDstType, int nDstPixelStride,
    3744             :                                int nWordCount)
    3745             : {
    3746    78070700 :     GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
    3747             :                     nDstPixelStride, nWordCount);
    3748    78070700 : }
    3749             : 
    3750             : /************************************************************************/
    3751             : /*                          GDALCopyWords64()                           */
    3752             : /************************************************************************/
    3753             : 
    3754             : /**
    3755             :  * Copy pixel words from buffer to buffer.
    3756             :  *
    3757             :  * This function is used to copy pixel word values from one memory buffer
    3758             :  * to another, with support for conversion between data types, and differing
    3759             :  * step factors. The data type conversion is done using the following
    3760             :  * rules:
    3761             :  * <ul>
    3762             :  * <li>Values assigned to a lower range integer type are clipped. For
    3763             :  * instance assigning GDT_Int16 values to a GDT_Byte buffer will cause values
    3764             :  * less than 0 to be set to 0, and values larger than 255 to be set to 255.
    3765             :  * </li>
    3766             :  * <li>
    3767             :  * Assignment from floating point to integer rounds to closest integer.
    3768             :  * +Infinity is mapped to the largest integer. -Infinity is mapped to the
    3769             :  * smallest integer. NaN is mapped to 0.
    3770             :  * </li>
    3771             :  * <li>
    3772             :  * Assignment from non-complex to complex will result in the imaginary part
    3773             :  * being set to zero on output.
    3774             :  * </li>
    3775             :  * <li> Assignment from complex to
    3776             :  * non-complex will result in the complex portion being lost and the real
    3777             :  * component being preserved (<i>not magnitude!</i>).
    3778             :  * </li>
    3779             :  * </ul>
    3780             :  *
    3781             :  * No assumptions are made about the source or destination words occurring
    3782             :  * on word boundaries.  It is assumed that all values are in native machine
    3783             :  * byte order.
    3784             :  *
                        * A source pixel stride of 0 combined with nWordCount > 1 replicates the
                        * single source word into every destination word.
                        *
                        * If the size reported for eSrcType or eDstType is 0, a CE_Failure /
                        * CPLE_NotSupported error is emitted and nothing is copied.
                        *
    3785             :  * @param pSrcData Pointer to source data to be converted.
    3786             :  * @param eSrcType the source data type (see GDALDataType enum)
    3787             :  * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
    3788             :  * in bytes
    3789             :  * @param pDstData Pointer to buffer where destination data should go
    3790             :  * @param eDstType the destination data type (see GDALDataType enum)
    3791             :  * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
    3792             :  * words), in bytes
    3793             :  * @param nWordCount number of words to be copied
    3794             :  *
    3795             :  * @note
    3796             :  * When adding a new data type to GDAL, you must do the following to
    3797             :  * support it properly within the GDALCopyWords function:
    3798             :  * 1. Add the data type to the switch on eSrcType in GDALCopyWords64.
    3799             :  *    This should invoke the appropriate GDALCopyWordsFromT wrapper.
    3800             :  * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
    3801             :  *    This should call the appropriate GDALCopyWordsT template.
    3802             :  * 3. If appropriate, overload the appropriate CopyWord template in the
    3803             :  *    above namespace. This will ensure that any conversion issues are
    3804             :  *    handled (cases like the float -> int32 case, where the min/max
    3805             :  *    values are subject to roundoff error).
    3806             :  */
    3807             : 
    3808   108685000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
    3809             :                                  GDALDataType eSrcType, int nSrcPixelStride,
    3810             :                                  void *CPL_RESTRICT pDstData,
    3811             :                                  GDALDataType eDstType, int nDstPixelStride,
    3812             :                                  GPtrDiff_t nWordCount)
    3813             : 
    3814             : {
    3815             :     // On platforms where alignment matters, be careful
    3816   108685000 :     const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
    3817   108683000 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
    3818   108684000 :     if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
    3819             :     {
    3820           2 :         CPLError(CE_Failure, CPLE_NotSupported,
    3821             :                  "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
    3822             :                  "argument");
    3823           2 :         return;
    3824             :     }
                       // Unaligned fallback: taken when a pointer or a stride is not a
                       // multiple of its word size. It is skipped when both the type and the
                       // pixel stride are identical on the two sides.
    3825   108684000 :     if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
    3826    58948600 :         ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
    3827    58946100 :          (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
    3828    58945900 :          (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
    3829    58945700 :          (nDstPixelStride % nDstDataTypeSize) != 0))
    3830             :     {
    3831         905 :         if (eSrcType == eDstType)
    3832             :         {
                               // Same type: no conversion, copy the bytes of each word.
    3833       34800 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    3834             :             {
    3835       34000 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    3836             :                        static_cast<const GByte *>(pSrcData) +
    3837       34000 :                            nSrcPixelStride * i,
    3838             :                        nDstDataTypeSize);
    3839             :             }
    3840             :         }
    3841             :         else
    3842             :         {
                               // Returns the first address >= ptr that is a multiple of
                               // align.
    3843         210 :             const auto getAlignedPtr = [](GByte *ptr, int align)
    3844             :             {
    3845             :                 return ptr +
    3846         210 :                        ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
    3847         210 :                         align);
    3848             :             };
    3849             : 
    3850             :             // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
    3851             :             // be sure to get correctly aligned pointer.
    3852         105 :             constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
    3853             :             GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
    3854             :             GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
    3855             :             GByte *pabySrcBuffer =
    3856         105 :                 getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
    3857             :             GByte *pabyDstBuffer =
    3858         105 :                 getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
                               // Convert one word at a time through the aligned scratch
                               // buffers. The recursive call passes aligned pointers, zero
                               // strides and a count of 1, so it neither re-enters this
                               // fallback nor the replication path (which needs
                               // nWordCount > 1).
    3859        3360 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    3860             :             {
    3861        3255 :                 memcpy(pabySrcBuffer,
    3862             :                        static_cast<const GByte *>(pSrcData) +
    3863        3255 :                            nSrcPixelStride * i,
    3864             :                        nSrcDataTypeSize);
    3865        3255 :                 GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
    3866             :                                 eDstType, 0, 1);
    3867        3255 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    3868             :                        pabyDstBuffer, nDstDataTypeSize);
    3869             :             }
    3870             :         }
    3871         905 :         return;
    3872             :     }
    3873             : 
    3874             :     // Deal with the case where we're replicating a single word into the
    3875             :     // provided buffer
    3876   108683000 :     if (nSrcPixelStride == 0 && nWordCount > 1)
    3877             :     {
    3878     1047130 :         GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
    3879             :                           nDstPixelStride, nWordCount);
    3880     1047880 :         return;
    3881             :     }
    3882             : 
                       // Same-type fast paths: no value conversion is needed.
    3883   107636000 :     if (eSrcType == eDstType)
    3884             :     {
                           // 1-byte types go through the byte instantiation of GDALFastCopy().
    3885    53608400 :         if (eSrcType == GDT_Byte || eSrcType == GDT_Int8)
    3886             :         {
    3887    17939200 :             GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
    3888             :                          static_cast<const GByte *>(pSrcData), nSrcPixelStride,
    3889             :                          nWordCount);
    3890    17939400 :             return;
    3891             :         }
    3892             : 
                           // Any 2-byte type with even strides is copied as plain shorts.
    3893    35669200 :         if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
    3894    21450000 :             (nDstPixelStride % 2) == 0)
    3895             :         {
    3896    21450000 :             GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
    3897             :                          static_cast<const short *>(pSrcData), nSrcPixelStride,
    3898             :                          nWordCount);
    3899    21449800 :             return;
    3900             :         }
    3901             : 
                           // Single word: memcpy() with a literal size per word size. Any size
                           // other than 2, 4 or 8 is treated as 16 bytes.
    3902    14219200 :         if (nWordCount == 1)
    3903             :         {
    3904             : #if defined(CSA_BUILD) || defined(__COVERITY__)
    3905             :             // Avoid false positives...
    3906             :             memcpy(pDstData, pSrcData, nSrcDataTypeSize);
    3907             : #else
    3908    13899000 :             if (nSrcDataTypeSize == 2)
    3909           0 :                 memcpy(pDstData, pSrcData, 2);
    3910    13899000 :             else if (nSrcDataTypeSize == 4)
    3911    13811800 :                 memcpy(pDstData, pSrcData, 4);
    3912       87185 :             else if (nSrcDataTypeSize == 8)
    3913       70580 :                 memcpy(pDstData, pSrcData, 8);
    3914             :             else /* if( eSrcType == GDT_CFloat64 ) */
    3915       16605 :                 memcpy(pDstData, pSrcData, 16);
    3916             : #endif
    3917    13899000 :             return;
    3918             :         }
    3919             : 
    3920             :         // Let memcpy() handle the case where we're copying a packed buffer
    3921             :         // of pixels.
    3922      320226 :         if (nSrcPixelStride == nDstPixelStride)
    3923             :         {
    3924      192202 :             if (nSrcPixelStride == nSrcDataTypeSize)
    3925             :             {
    3926      192134 :                 memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
    3927      192134 :                 return;
    3928             :             }
    3929             :         }
    3930             :     }
    3931             : 
    3932             :     // Handle the more general case -- deals with conversion of data types
    3933             :     // directly.
                       // The template argument is the component type of the source; the bool
                       // argument is true exactly for the GDT_C* source types.
                       // NOTE(review): presumably it flags a complex source (two components
                       // per word) - confirm against GDALCopyWordsFromT.
    3934    54155500 :     switch (eSrcType)
    3935             :     {
    3936    15508100 :         case GDT_Byte:
    3937    15508100 :             GDALCopyWordsFromT<unsigned char>(
    3938             :                 static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
    3939             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3940    15503800 :             break;
    3941        1248 :         case GDT_Int8:
    3942        1248 :             GDALCopyWordsFromT<signed char>(
    3943             :                 static_cast<const signed char *>(pSrcData), nSrcPixelStride,
    3944             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3945        1248 :             break;
    3946       54265 :         case GDT_UInt16:
    3947       54265 :             GDALCopyWordsFromT<unsigned short>(
    3948             :                 static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
    3949             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3950       54265 :             break;
    3951     4353700 :         case GDT_Int16:
    3952     4353700 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    3953             :                                       nSrcPixelStride, false, pDstData,
    3954             :                                       eDstType, nDstPixelStride, nWordCount);
    3955     4353700 :             break;
    3956        7107 :         case GDT_UInt32:
    3957        7107 :             GDALCopyWordsFromT<unsigned int>(
    3958             :                 static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
    3959             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3960        7107 :             break;
    3961    12255100 :         case GDT_Int32:
    3962    12255100 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    3963             :                                     nSrcPixelStride, false, pDstData, eDstType,
    3964             :                                     nDstPixelStride, nWordCount);
    3965    12255100 :             break;
    3966        1641 :         case GDT_UInt64:
    3967        1641 :             GDALCopyWordsFromT<std::uint64_t>(
    3968             :                 static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
    3969             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3970        1641 :             break;
    3971       11270 :         case GDT_Int64:
    3972       11270 :             GDALCopyWordsFromT<std::int64_t>(
    3973             :                 static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
    3974             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3975       11270 :             break;
    3976        1370 :         case GDT_Float16:
    3977        1370 :             GDALCopyWordsFromT<GFloat16>(
    3978             :                 static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
    3979             :                 pDstData, eDstType, nDstPixelStride, nWordCount);
    3980        1370 :             break;
    3981      504812 :         case GDT_Float32:
    3982      504812 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    3983             :                                       nSrcPixelStride, false, pDstData,
    3984             :                                       eDstType, nDstPixelStride, nWordCount);
    3985      504811 :             break;
    3986    20698300 :         case GDT_Float64:
    3987    20698300 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    3988             :                                        nSrcPixelStride, false, pDstData,
    3989             :                                        eDstType, nDstPixelStride, nWordCount);
    3990    20698300 :             break;
    3991      478141 :         case GDT_CInt16:
    3992      478141 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    3993             :                                       nSrcPixelStride, true, pDstData, eDstType,
    3994             :                                       nDstPixelStride, nWordCount);
    3995      478141 :             break;
    3996         556 :         case GDT_CInt32:
    3997         556 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    3998             :                                     nSrcPixelStride, true, pDstData, eDstType,
    3999             :                                     nDstPixelStride, nWordCount);
    4000         556 :             break;
    4001         508 :         case GDT_CFloat16:
    4002         508 :             GDALCopyWordsFromT<GFloat16>(
    4003             :                 static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
    4004             :                 pDstData, eDstType, nDstPixelStride, nWordCount);
    4005         508 :             break;
    4006        2077 :         case GDT_CFloat32:
    4007        2077 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    4008             :                                       nSrcPixelStride, true, pDstData, eDstType,
    4009             :                                       nDstPixelStride, nWordCount);
    4010        2077 :             break;
    4011      276974 :         case GDT_CFloat64:
    4012      276974 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    4013             :                                        nSrcPixelStride, true, pDstData,
    4014             :                                        eDstType, nDstPixelStride, nWordCount);
    4015      276974 :             break;
                           // NOTE(review): presumably unreachable because of the size check
                           // at the top of the function - confirm that
                           // GDALGetDataTypeSizeBytes() returns 0 for these two values.
    4016           0 :         case GDT_Unknown:
    4017             :         case GDT_TypeCount:
    4018           0 :             CPLAssert(false);
    4019             :     }
    4020             : }
    4021             : 
    4022             : /************************************************************************/
    4023             : /*                            GDALCopyBits()                            */
    4024             : /************************************************************************/
    4025             : 
    4026             : /**
    4027             :  * Bitwise word copying.
    4028             :  *
    4029             :  * A function for moving sets of partial bytes around.  Loosely
    4030             :  * speaking this is a bitwise analog to GDALCopyWords().
    4031             :  *
    4032             :  * It copies nStepCount "words" where each word is nBitCount bits long.
    4033             :  * The nSrcStep and nDstStep are the number of bits from the start of one
    4034             :  * word to the next (same as nBitCount if they are packed).  The nSrcOffset
    4035             :  * and nDstOffset are the offset into the source and destination buffers
    4036             :  * to start at, also measured in bits.
    4037             :  *
    4038             :  * All bit offsets are assumed to start from the high order bit in a byte
    4039             :  * (i.e. most significant bit first).  Currently this function is not very
    4040             :  * optimized, but it may be improved for some common cases in the future
    4041             :  * as needed.
    4042             :  *
    4043             :  * @param pabySrcData the source data buffer.
    4044             :  * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
    4045             :  * first word to copy.
    4046             :  * @param nSrcStep the offset in bits from the start one source word to the
    4047             :  * start of the next.
    4048             :  * @param pabyDstData the destination data buffer.
    4049             :  * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
    4050             :  * first word to copy over.
    4051             :  * @param nDstStep the offset in bits from the start one word to the
    4052             :  * start of the next.
    4053             :  * @param nBitCount the number of bits in a word to be copied.
    4054             :  * @param nStepCount the number of words to copy.
    4055             :  */
    4056             : 
    4057           0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
    4058             :                   GByte *pabyDstData, int nDstOffset, int nDstStep,
    4059             :                   int nBitCount, int nStepCount)
    4060             : 
    4061             : {
    4062           0 :     VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
    4063             : 
    4064           0 :     for (int iStep = 0; iStep < nStepCount; iStep++)
    4065             :     {
    4066           0 :         for (int iBit = 0; iBit < nBitCount; iBit++)
    4067             :         {
    4068           0 :             if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
    4069           0 :                 pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
    4070             :             else
    4071           0 :                 pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
    4072             : 
    4073           0 :             nSrcOffset++;
    4074           0 :             nDstOffset++;
    4075             :         }
    4076             : 
    4077           0 :         nSrcOffset += (nSrcStep - nBitCount);
    4078           0 :         nDstOffset += (nDstStep - nBitCount);
    4079             :     }
    4080             : }
    4081             : 
    4082             : /************************************************************************/
    4083             : /*                    GDALGetBestOverviewLevel()                        */
    4084             : /*                                                                      */
    4085             : /* Returns the best overview level to satisfy the query or -1 if none   */
    4086             : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when        */
    4087             : /* returning a valid overview level                                     */
    4088             : /************************************************************************/
    4089             : 
    4090           0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
    4091             :                                  int &nXSize, int &nYSize, int nBufXSize,
    4092             :                                  int nBufYSize)
    4093             : {
    4094           0 :     return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
    4095           0 :                                          nBufXSize, nBufYSize, nullptr);
    4096             : }
    4097             : 
    4098      523977 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
    4099             :                                   int &nYOff, int &nXSize, int &nYSize,
    4100             :                                   int nBufXSize, int nBufYSize,
    4101             :                                   GDALRasterIOExtraArg *psExtraArg)
    4102             : {
    4103      523977 :     if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
    4104      523977 :         psExtraArg->bUseOnlyThisScale)
    4105         109 :         return -1;
    4106             :     /* -------------------------------------------------------------------- */
    4107             :     /*      Compute the desired downsampling factor.  It is                 */
    4108             :     /*      based on the least reduced axis, and represents the number      */
    4109             :     /*      of source pixels to one destination pixel.                      */
    4110             :     /* -------------------------------------------------------------------- */
    4111      523868 :     const double dfDesiredDownsamplingFactor =
    4112      523868 :         ((nXSize / static_cast<double>(nBufXSize)) <
    4113      361530 :              (nYSize / static_cast<double>(nBufYSize)) ||
    4114             :          nBufYSize == 1)
    4115      752255 :             ? nXSize / static_cast<double>(nBufXSize)
    4116      133143 :             : nYSize / static_cast<double>(nBufYSize);
    4117             : 
    4118             :     /* -------------------------------------------------------------------- */
    4119             :     /*      Find the overview level that largest downsampling factor (most  */
    4120             :     /*      downsampled) that is still less than (or only a little more)    */
    4121             :     /*      downsampled than the request.                                   */
    4122             :     /* -------------------------------------------------------------------- */
    4123      523868 :     const int nOverviewCount = poBand->GetOverviewCount();
    4124      523868 :     GDALRasterBand *poBestOverview = nullptr;
    4125      523868 :     double dfBestDownsamplingFactor = 0;
    4126      523868 :     int nBestOverviewLevel = -1;
    4127             : 
    4128             :     const char *pszOversampligThreshold =
    4129      523868 :         CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
    4130             : 
    4131             :     // Note: keep this logic for overview selection in sync between
    4132             :     // gdalwarp_lib.cpp and rasterio.cpp
    4133             :     // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
    4134             :     const double dfOversamplingThreshold =
    4135     1047730 :         pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
    4136      523859 :         : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
    4137     1047720 :             ? 1.0
    4138      523868 :             : 1.2;
    4139      526564 :     for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    4140             :     {
    4141        5612 :         GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
    4142       11224 :         if (poOverview == nullptr ||
    4143       11223 :             poOverview->GetXSize() > poBand->GetXSize() ||
    4144        5611 :             poOverview->GetYSize() > poBand->GetYSize())
    4145             :         {
    4146           1 :             continue;
    4147             :         }
    4148             : 
    4149             :         // Compute downsampling factor of this overview
    4150             :         const double dfDownsamplingFactor = std::min(
    4151        5611 :             poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
    4152       11222 :             poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
    4153             : 
    4154             :         // Is it nearly the requested factor and better (lower) than
    4155             :         // the current best factor?
    4156             :         // Use an epsilon because of numerical instability.
    4157        5611 :         constexpr double EPSILON = 1e-1;
    4158        5719 :         if (dfDownsamplingFactor >=
    4159        5611 :                 dfDesiredDownsamplingFactor * dfOversamplingThreshold +
    4160        5503 :                     EPSILON ||
    4161             :             dfDownsamplingFactor <= dfBestDownsamplingFactor)
    4162             :         {
    4163         108 :             continue;
    4164             :         }
    4165             : 
    4166             :         // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
    4167        5503 :         const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
    4168             : 
    4169        5503 :         if (pszResampling != nullptr &&
    4170          71 :             STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
    4171          16 :             continue;
    4172             : 
    4173             :         // OK, this is our new best overview.
    4174        5487 :         poBestOverview = poOverview;
    4175        5487 :         nBestOverviewLevel = iOverview;
    4176        5487 :         dfBestDownsamplingFactor = dfDownsamplingFactor;
    4177             : 
    4178        5487 :         if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
    4179             :             EPSILON)
    4180             :         {
    4181        2916 :             break;
    4182             :         }
    4183             :     }
    4184             : 
    4185             :     /* -------------------------------------------------------------------- */
    4186             :     /*      If we didn't find an overview that helps us, just return        */
    4187             :     /*      indicating failure and the full resolution image will be used.  */
    4188             :     /* -------------------------------------------------------------------- */
    4189      523868 :     if (nBestOverviewLevel < 0)
    4190      520879 :         return -1;
    4191             : 
    4192             :     /* -------------------------------------------------------------------- */
    4193             :     /*      Recompute the source window in terms of the selected            */
    4194             :     /*      overview.                                                       */
    4195             :     /* -------------------------------------------------------------------- */
    4196             :     const double dfXFactor =
    4197        2989 :         poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
    4198             :     const double dfYFactor =
    4199        2989 :         poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
    4200        2989 :     CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
    4201             :              poBestOverview->GetYSize());
    4202             : 
    4203        8967 :     const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
    4204        2989 :                                 static_cast<int>(nXOff / dfXFactor + 0.5));
    4205        8967 :     const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
    4206        2989 :                                 static_cast<int>(nYOff / dfYFactor + 0.5));
    4207        2989 :     int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
    4208        2989 :     int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
    4209        2989 :     if (nOXOff + nOXSize > poBestOverview->GetXSize())
    4210           0 :         nOXSize = poBestOverview->GetXSize() - nOXOff;
    4211        2989 :     if (nOYOff + nOYSize > poBestOverview->GetYSize())
    4212           2 :         nOYSize = poBestOverview->GetYSize() - nOYOff;
    4213             : 
    4214        2989 :     if (psExtraArg)
    4215             :     {
    4216        2989 :         if (psExtraArg->bFloatingPointWindowValidity)
    4217             :         {
    4218         115 :             psExtraArg->dfXOff /= dfXFactor;
    4219         115 :             psExtraArg->dfXSize /= dfXFactor;
    4220         115 :             psExtraArg->dfYOff /= dfYFactor;
    4221         115 :             psExtraArg->dfYSize /= dfYFactor;
    4222             :         }
    4223        2874 :         else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
    4224             :         {
    4225          16 :             psExtraArg->bFloatingPointWindowValidity = true;
    4226          16 :             psExtraArg->dfXOff = nXOff / dfXFactor;
    4227          16 :             psExtraArg->dfXSize = nXSize / dfXFactor;
    4228          16 :             psExtraArg->dfYOff = nYOff / dfYFactor;
    4229          16 :             psExtraArg->dfYSize = nYSize / dfYFactor;
    4230             :         }
    4231             :     }
    4232             : 
    4233        2989 :     nXOff = nOXOff;
    4234        2989 :     nYOff = nOYOff;
    4235        2989 :     nXSize = nOXSize;
    4236        2989 :     nYSize = nOYSize;
    4237             : 
    4238        2989 :     return nBestOverviewLevel;
    4239             : }
    4240             : 
    4241             : /************************************************************************/
    4242             : /*                          OverviewRasterIO()                          */
    4243             : /*                                                                      */
    4244             : /*      Special work function to utilize available overviews to         */
    4245             : /*      more efficiently satisfy downsampled requests.  It will         */
    4246             : /*      return CE_Failure if there are no appropriate overviews         */
    4247             : /*      available but it doesn't emit any error messages.               */
    4248             : /************************************************************************/
    4249             : 
    4250             : //! @cond Doxygen_Suppress
    4251           2 : CPLErr GDALRasterBand::OverviewRasterIO(
    4252             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4253             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4254             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    4255             : 
    4256             : {
    4257             :     GDALRasterIOExtraArg sExtraArg;
    4258           2 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4259             : 
    4260           2 :     const int nOverview = GDALBandGetBestOverviewLevel2(
    4261             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
    4262           2 :     if (nOverview < 0)
    4263           1 :         return CE_Failure;
    4264             : 
    4265             :     /* -------------------------------------------------------------------- */
    4266             :     /*      Recast the call in terms of the new raster layer.               */
    4267             :     /* -------------------------------------------------------------------- */
    4268           1 :     GDALRasterBand *poOverviewBand = GetOverview(nOverview);
    4269           1 :     if (poOverviewBand == nullptr)
    4270           0 :         return CE_Failure;
    4271             : 
    4272           1 :     return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4273             :                                     pData, nBufXSize, nBufYSize, eBufType,
    4274           1 :                                     nPixelSpace, nLineSpace, &sExtraArg);
    4275             : }
    4276             : 
    4277             : /************************************************************************/
    4278             : /*                      TryOverviewRasterIO()                           */
    4279             : /************************************************************************/
    4280             : 
    4281      362417 : CPLErr GDALRasterBand::TryOverviewRasterIO(
    4282             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4283             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4284             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
    4285             :     int *pbTried)
    4286             : {
    4287      362417 :     int nXOffMod = nXOff;
    4288      362417 :     int nYOffMod = nYOff;
    4289      362417 :     int nXSizeMod = nXSize;
    4290      362417 :     int nYSizeMod = nYSize;
    4291             :     GDALRasterIOExtraArg sExtraArg;
    4292             : 
    4293      362417 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4294             : 
    4295      362417 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    4296             :         this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
    4297             :         &sExtraArg);
    4298             : 
    4299      362417 :     if (iOvrLevel >= 0)
    4300             :     {
    4301          50 :         GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
    4302          50 :         if (poOverviewBand)
    4303             :         {
    4304          50 :             *pbTried = TRUE;
    4305          50 :             return poOverviewBand->RasterIO(
    4306             :                 eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
    4307             :                 nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    4308          50 :                 &sExtraArg);
    4309             :         }
    4310             :     }
    4311             : 
    4312      362367 :     *pbTried = FALSE;
    4313      362367 :     return CE_None;
    4314             : }
    4315             : 
    4316             : /************************************************************************/
    4317             : /*                      TryOverviewRasterIO()                           */
    4318             : /************************************************************************/
    4319             : 
    4320      158584 : CPLErr GDALDataset::TryOverviewRasterIO(
    4321             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4322             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4323             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    4324             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
    4325             :     int *pbTried)
    4326             : {
    4327      158584 :     int nXOffMod = nXOff;
    4328      158584 :     int nYOffMod = nYOff;
    4329      158584 :     int nXSizeMod = nXSize;
    4330      158584 :     int nYSizeMod = nYSize;
    4331             :     GDALRasterIOExtraArg sExtraArg;
    4332      158584 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4333             : 
    4334      317168 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    4335      158584 :         papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
    4336             :         nBufYSize, &sExtraArg);
    4337             : 
    4338      158625 :     if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
    4339          41 :         papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
    4340             :     {
    4341          41 :         *pbTried = TRUE;
    4342          41 :         return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
    4343             :             eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
    4344             :             nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
    4345          41 :             nLineSpace, nBandSpace, &sExtraArg);
    4346             :     }
    4347             :     else
    4348             :     {
    4349      158543 :         *pbTried = FALSE;
    4350      158543 :         return CE_None;
    4351             :     }
    4352             : }
    4353             : 
    4354             : /************************************************************************/
    4355             : /*                        GetBestOverviewLevel()                        */
    4356             : /*                                                                      */
    4357             : /* Returns the best overview level to satisfy the query or -1 if none   */
    4358             : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid     */
    4359             : /* overview level                                                       */
    4360             : /************************************************************************/
    4361             : 
    4362           4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
    4363             :                                            int &nYOff, int &nXSize, int &nYSize,
    4364             :                                            int nBufXSize, int nBufYSize,
    4365             :                                            int nBandCount,
    4366             :                                            const int *panBandMap,
    4367             :                                            GDALRasterIOExtraArg *psExtraArg)
    4368             : {
    4369           4 :     int nOverviewCount = 0;
    4370           4 :     GDALRasterBand *poFirstBand = nullptr;
    4371             : 
    4372             :     /* -------------------------------------------------------------------- */
    4373             :     /* Check that all bands have the same number of overviews and           */
    4374             :     /* that they have all the same size and block dimensions                */
    4375             :     /* -------------------------------------------------------------------- */
    4376          12 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4377             :     {
    4378           8 :         GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
    4379           8 :         if (poBand == nullptr)
    4380           0 :             return -1;
    4381           8 :         if (iBand == 0)
    4382             :         {
    4383           4 :             poFirstBand = poBand;
    4384           4 :             nOverviewCount = poBand->GetOverviewCount();
    4385             :         }
    4386           4 :         else if (nOverviewCount != poBand->GetOverviewCount())
    4387             :         {
    4388           0 :             CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    4389             :                              "mismatched overview count, use std method.");
    4390           0 :             return -1;
    4391             :         }
    4392             :         else
    4393             :         {
    4394           4 :             for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    4395             :             {
    4396           0 :                 GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
    4397             :                 GDALRasterBand *poOvrFirstBand =
    4398           0 :                     poFirstBand->GetOverview(iOverview);
    4399           0 :                 if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
    4400           0 :                     continue;
    4401             : 
    4402           0 :                 if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
    4403           0 :                     poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
    4404             :                 {
    4405           0 :                     CPLDebug("GDAL",
    4406             :                              "GDALDataset::GetBestOverviewLevel() ... "
    4407             :                              "mismatched overview sizes, use std method.");
    4408           0 :                     return -1;
    4409             :                 }
    4410           0 :                 int nBlockXSizeFirst = 0;
    4411           0 :                 int nBlockYSizeFirst = 0;
    4412           0 :                 poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
    4413             :                                              &nBlockYSizeFirst);
    4414             : 
    4415           0 :                 int nBlockXSizeCurrent = 0;
    4416           0 :                 int nBlockYSizeCurrent = 0;
    4417           0 :                 poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
    4418             :                                         &nBlockYSizeCurrent);
    4419             : 
    4420           0 :                 if (nBlockXSizeFirst != nBlockXSizeCurrent ||
    4421           0 :                     nBlockYSizeFirst != nBlockYSizeCurrent)
    4422             :                 {
    4423           0 :                     CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    4424             :                                      "mismatched block sizes, use std method.");
    4425           0 :                     return -1;
    4426             :                 }
    4427             :             }
    4428             :         }
    4429             :     }
    4430           4 :     if (poFirstBand == nullptr)
    4431           0 :         return -1;
    4432             : 
    4433           4 :     return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
    4434             :                                          nYSize, nBufXSize, nBufYSize,
    4435           4 :                                          psExtraArg);
    4436             : }
    4437             : 
    4438             : /************************************************************************/
    4439             : /*                         BlockBasedRasterIO()                         */
    4440             : /*                                                                      */
    4441             : /*      This convenience function implements a dataset level            */
    4442             : /*      RasterIO() interface based on calling down to fetch blocks,     */
    4443             : /*      much like the GDALRasterBand::IRasterIO(), but it handles       */
    4444             : /*      all bands at once, so that a format driver that handles a       */
    4445             : /*      request for different bands of the same block efficiently       */
    4446             : /*      (i.e. without re-reading interleaved data) will efficiently.    */
    4447             : /*                                                                      */
    4448             : /*      This method is intended to be called by an overridden           */
    4449             : /*      IRasterIO() method in the driver specific GDALDataset           */
    4450             : /*      derived class.                                                  */
    4451             : /*                                                                      */
    4452             : /*      Default internal implementation of RasterIO() ... utilizes      */
    4453             : /*      the Block access methods to satisfy the request.  This would    */
    4454             : /*      normally only be overridden by formats with overviews.          */
    4455             : /*                                                                      */
    4456             : /*      To keep things relatively simple, this method does not          */
    4457             : /*      currently take advantage of some special cases addressed in     */
    4458             : /*      GDALRasterBand::IRasterIO(), so it is likely best to only       */
    4459             : /*      call it when you know it will help.  That is in cases where     */
    4460             : /*      data is at 1:1 to the buffer, and you know the driver is        */
    4461             : /*      implementing interleaved IO efficiently on a block by block     */
    4462             : /*      basis. Overviews will be used when possible.                    */
    4463             : /************************************************************************/
    4464             : 
    4465       64099 : CPLErr GDALDataset::BlockBasedRasterIO(
    4466             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4467             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4468             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    4469             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    4470             : 
    4471             : {
    4472       64099 :     CPLAssert(nullptr != pData);
    4473             : 
    4474       64099 :     GByte **papabySrcBlock = nullptr;
    4475       64099 :     GDALRasterBlock *poBlock = nullptr;
    4476       64099 :     GDALRasterBlock **papoBlocks = nullptr;
    4477       64099 :     int nLBlockX = -1;
    4478       64099 :     int nLBlockY = -1;
    4479             :     int iBufYOff;
    4480             :     int iBufXOff;
    4481       64099 :     int nBlockXSize = 1;
    4482       64099 :     int nBlockYSize = 1;
    4483       64099 :     CPLErr eErr = CE_None;
    4484       64099 :     GDALDataType eDataType = GDT_Byte;
    4485             : 
    4486       64099 :     const bool bUseIntegerRequestCoords =
    4487       64129 :         (!psExtraArg->bFloatingPointWindowValidity ||
    4488          30 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
    4489          28 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
    4490             : 
    4491             :     /* -------------------------------------------------------------------- */
    4492             :     /*      Ensure that all bands share a common block size and data type.  */
    4493             :     /* -------------------------------------------------------------------- */
    4494      303865 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4495             :     {
    4496      239766 :         GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4497             : 
    4498      239767 :         if (iBand == 0)
    4499             :         {
    4500       64097 :             poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4501       64098 :             eDataType = poBand->GetRasterDataType();
    4502             :         }
    4503             :         else
    4504             :         {
    4505      175670 :             int nThisBlockXSize = 0;
    4506      175670 :             int nThisBlockYSize = 0;
    4507      175670 :             poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
    4508      175668 :             if (nThisBlockXSize != nBlockXSize ||
    4509      175668 :                 nThisBlockYSize != nBlockYSize)
    4510             :             {
    4511           0 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4512             :                                  "mismatched block sizes, use std method.");
    4513           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4514             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4515             :                                          nBandCount, panBandMap, nPixelSpace,
    4516           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4517             :             }
    4518             : 
    4519      175669 :             if (eDataType != poBand->GetRasterDataType() &&
    4520           0 :                 (nXSize != nBufXSize || nYSize != nBufYSize))
    4521             :             {
    4522           0 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4523             :                                  "mismatched band data types, use std method.");
    4524           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4525             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4526             :                                          nBandCount, panBandMap, nPixelSpace,
    4527           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4528             :             }
    4529             :         }
    4530             :     }
    4531             : 
    4532             :     /* ==================================================================== */
    4533             :     /*      In this special case at full resolution we step through in      */
    4534             :     /*      blocks, turning the request over to the per-band                */
    4535             :     /*      IRasterIO(), but ensuring that all bands of one block are       */
    4536             :     /*      called before proceeding to the next.                           */
    4537             :     /* ==================================================================== */
    4538             : 
    4539       64099 :     if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
    4540             :     {
    4541             :         GDALRasterIOExtraArg sDummyExtraArg;
    4542       64094 :         INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
    4543             : 
    4544       64094 :         int nChunkYSize = 0;
    4545       64094 :         int nChunkXSize = 0;
    4546             : 
    4547      210624 :         for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
    4548             :         {
    4549      147544 :             const int nChunkYOff = iBufYOff + nYOff;
    4550      147544 :             nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
    4551      147544 :             if (nChunkYOff + nChunkYSize > nYOff + nYSize)
    4552       59160 :                 nChunkYSize = (nYOff + nYSize) - nChunkYOff;
    4553             : 
    4554      818199 :             for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
    4555             :             {
    4556      671667 :                 const int nChunkXOff = iBufXOff + nXOff;
    4557      671667 :                 nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
    4558      671667 :                 if (nChunkXOff + nChunkXSize > nXOff + nXSize)
    4559       70320 :                     nChunkXSize = (nXOff + nXSize) - nChunkXOff;
    4560             : 
    4561      671667 :                 GByte *pabyChunkData =
    4562      671667 :                     static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
    4563      671667 :                     static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
    4564             : 
    4565     3270470 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4566             :                 {
    4567     2599810 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4568             : 
    4569     5199630 :                     eErr = poBand->IRasterIO(
    4570             :                         eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
    4571             :                         nChunkYSize,
    4572     2599810 :                         pabyChunkData +
    4573     2599810 :                             static_cast<GPtrDiff_t>(iBand) * nBandSpace,
    4574             :                         nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
    4575     2599810 :                         nLineSpace, &sDummyExtraArg);
    4576     2599810 :                     if (eErr != CE_None)
    4577        1014 :                         return eErr;
    4578             :                 }
    4579             :             }
    4580             : 
    4581      165361 :             if (psExtraArg->pfnProgress != nullptr &&
    4582       18829 :                 !psExtraArg->pfnProgress(
    4583      165361 :                     1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
    4584             :                         nBufYSize,
    4585             :                     "", psExtraArg->pProgressData))
    4586             :             {
    4587           2 :                 return CE_Failure;
    4588             :             }
    4589             :         }
    4590             : 
    4591       63080 :         return CE_None;
    4592             :     }
    4593             : 
    4594             :     /* Below code is not compatible with that case. It would need a complete */
    4595             :     /* separate code like done in GDALRasterBand::IRasterIO. */
    4596           5 :     if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
    4597             :     {
    4598           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4599             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4600             :                                  panBandMap, nPixelSpace, nLineSpace,
    4601           0 :                                  nBandSpace, psExtraArg);
    4602             :     }
    4603             : 
    4604             :     /* We could have a smarter implementation, but that will do for now */
    4605           5 :     if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    4606           0 :         (nBufXSize != nXSize || nBufYSize != nYSize))
    4607             :     {
    4608           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4609             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4610             :                                  panBandMap, nPixelSpace, nLineSpace,
    4611           0 :                                  nBandSpace, psExtraArg);
    4612             :     }
    4613             : 
    4614             :     /* ==================================================================== */
    4615             :     /*      Loop reading required source blocks to satisfy output           */
    4616             :     /*      request.  This is the most general implementation.              */
    4617             :     /* ==================================================================== */
    4618             : 
    4619           5 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
    4620             : 
    4621             :     papabySrcBlock =
    4622           4 :         static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
    4623             :     papoBlocks =
    4624           4 :         static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
    4625             : 
    4626             :     /* -------------------------------------------------------------------- */
    4627             :     /*      Select an overview level if appropriate.                        */
    4628             :     /* -------------------------------------------------------------------- */
    4629             : 
    4630             :     GDALRasterIOExtraArg sExtraArg;
    4631           4 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4632           4 :     const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
    4633             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
    4634             :         panBandMap, &sExtraArg);
    4635           4 :     if (nOverviewLevel >= 0)
    4636             :     {
    4637           2 :         GetRasterBand(panBandMap[0])
    4638           2 :             ->GetOverview(nOverviewLevel)
    4639           2 :             ->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4640             :     }
    4641             : 
    4642           4 :     double dfXOff = nXOff;
    4643           4 :     double dfYOff = nYOff;
    4644           4 :     double dfXSize = nXSize;
    4645           4 :     double dfYSize = nYSize;
    4646           4 :     if (sExtraArg.bFloatingPointWindowValidity)
    4647             :     {
    4648           2 :         dfXOff = sExtraArg.dfXOff;
    4649           2 :         dfYOff = sExtraArg.dfYOff;
    4650           2 :         dfXSize = sExtraArg.dfXSize;
    4651           2 :         dfYSize = sExtraArg.dfYSize;
    4652             :     }
    4653             : 
    4654             :     /* -------------------------------------------------------------------- */
    4655             :     /*      Compute stepping increment.                                     */
    4656             :     /* -------------------------------------------------------------------- */
    4657           4 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
    4658           4 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
    4659             : 
    4660           4 :     constexpr double EPS = 1e-10;
    4661             :     /* -------------------------------------------------------------------- */
    4662             :     /*      Loop over buffer computing source locations.                    */
    4663             :     /* -------------------------------------------------------------------- */
    4664          36 :     for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
    4665             :     {
    4666             :         GPtrDiff_t iSrcOffset;
    4667             : 
    4668             :         // Add small epsilon to avoid some numeric precision issues.
    4669          32 :         const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
    4670          32 :         const int iSrcY = static_cast<int>(std::min(
    4671          32 :             std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
    4672             : 
    4673          32 :         GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
    4674             :                                 static_cast<GPtrDiff_t>(nLineSpace);
    4675             : 
    4676         302 :         for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
    4677             :         {
    4678         270 :             const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
    4679         270 :             const int iSrcX = static_cast<int>(std::min(
    4680         270 :                 std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
    4681             : 
    4682             :             // FIXME: this code likely doesn't work if the dirty block gets
    4683             :             // flushed to disk before being completely written. In the meantime,
    4684             :             // bJustInitialize should probably be set to FALSE even if it is not
    4685             :             // ideal performance wise, and for lossy compression
    4686             : 
    4687             :             /* --------------------------------------------------------------------
    4688             :              */
    4689             :             /*      Ensure we have the appropriate block loaded. */
    4690             :             /* --------------------------------------------------------------------
    4691             :              */
    4692         270 :             if (iSrcX < nLBlockX * nBlockXSize ||
    4693         270 :                 iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
    4694         266 :                 iSrcY < nLBlockY * nBlockYSize ||
    4695         266 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
    4696             :             {
    4697           4 :                 nLBlockX = iSrcX / nBlockXSize;
    4698           4 :                 nLBlockY = iSrcY / nBlockYSize;
    4699             : 
    4700           4 :                 const bool bJustInitialize =
    4701           0 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
    4702           0 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
    4703           4 :                     nXOff <= nLBlockX * nBlockXSize &&
    4704           0 :                     nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
    4705             :                 /*bool bMemZeroBuffer = FALSE;
    4706             :                 if( eRWFlag == GF_Write && !bJustInitialize &&
    4707             :                     nXOff <= nLBlockX * nBlockXSize &&
    4708             :                     nYOff <= nLBlockY * nBlockYSize &&
    4709             :                     (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
    4710             :                      (nXOff + nXSize == GetRasterXSize() &&
    4711             :                      (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
    4712             :                     (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
    4713             :                      (nYOff + nYSize == GetRasterYSize() &&
    4714             :                      (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
    4715             :                 {
    4716             :                     bJustInitialize = TRUE;
    4717             :                     bMemZeroBuffer = TRUE;
    4718             :                 }*/
    4719          12 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4720             :                 {
    4721           8 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4722           8 :                     if (nOverviewLevel >= 0)
    4723           2 :                         poBand = poBand->GetOverview(nOverviewLevel);
    4724          16 :                     poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
    4725           8 :                                                         bJustInitialize);
    4726           8 :                     if (poBlock == nullptr)
    4727             :                     {
    4728           0 :                         eErr = CE_Failure;
    4729           0 :                         goto CleanupAndReturn;
    4730             :                     }
    4731             : 
    4732           8 :                     if (eRWFlag == GF_Write)
    4733           0 :                         poBlock->MarkDirty();
    4734             : 
    4735           8 :                     if (papoBlocks[iBand] != nullptr)
    4736           0 :                         papoBlocks[iBand]->DropLock();
    4737             : 
    4738           8 :                     papoBlocks[iBand] = poBlock;
    4739             : 
    4740           8 :                     papabySrcBlock[iBand] =
    4741           8 :                         static_cast<GByte *>(poBlock->GetDataRef());
    4742             :                     /*if( bMemZeroBuffer )
    4743             :                     {
    4744             :                         memset(papabySrcBlock[iBand], 0,
    4745             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
    4746             :                     * nBlockYSize);
    4747             :                     }*/
    4748             :                 }
    4749             :             }
    4750             : 
    4751             :             /* --------------------------------------------------------------------
    4752             :              */
    4753             :             /*      Copy over this pixel of data. */
    4754             :             /* --------------------------------------------------------------------
    4755             :              */
    4756         270 :             iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
    4757         270 :                           static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
    4758         270 :                           (static_cast<GPtrDiff_t>(iSrcY) -
    4759         270 :                            static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
    4760         270 :                               nBlockXSize) *
    4761         270 :                          nBandDataSize;
    4762             : 
    4763         980 :             for (int iBand = 0; iBand < nBandCount; iBand++)
    4764             :             {
    4765         710 :                 GByte *pabySrcBlock = papabySrcBlock[iBand];
    4766         710 :                 GPtrDiff_t iBandBufOffset =
    4767         710 :                     iBufOffset + static_cast<GPtrDiff_t>(iBand) *
    4768             :                                      static_cast<GPtrDiff_t>(nBandSpace);
    4769             : 
    4770         710 :                 if (eDataType == eBufType)
    4771             :                 {
    4772         710 :                     if (eRWFlag == GF_Read)
    4773         710 :                         memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
    4774         710 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
    4775             :                     else
    4776           0 :                         memcpy(pabySrcBlock + iSrcOffset,
    4777             :                                static_cast<const GByte *>(pData) +
    4778           0 :                                    iBandBufOffset,
    4779             :                                nBandDataSize);
    4780             :                 }
    4781             :                 else
    4782             :                 {
    4783             :                     /* type to type conversion ... ouch, this is expensive way
    4784             :                        of handling single words */
    4785             : 
    4786           0 :                     if (eRWFlag == GF_Read)
    4787           0 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
    4788             :                                         static_cast<GByte *>(pData) +
    4789           0 :                                             iBandBufOffset,
    4790             :                                         eBufType, 0, 1);
    4791             :                     else
    4792           0 :                         GDALCopyWords64(static_cast<const GByte *>(pData) +
    4793           0 :                                             iBandBufOffset,
    4794           0 :                                         eBufType, 0, pabySrcBlock + iSrcOffset,
    4795             :                                         eDataType, 0, 1);
    4796             :                 }
    4797             :             }
    4798             : 
    4799         270 :             iBufOffset += static_cast<int>(nPixelSpace);
    4800             :         }
    4801             :     }
    4802             : 
    4803             :     /* -------------------------------------------------------------------- */
    4804             :     /*      CleanupAndReturn.                                               */
    4805             :     /* -------------------------------------------------------------------- */
    4806           4 : CleanupAndReturn:
    4807           4 :     CPLFree(papabySrcBlock);
    4808           4 :     if (papoBlocks != nullptr)
    4809             :     {
    4810          12 :         for (int iBand = 0; iBand < nBandCount; iBand++)
    4811             :         {
    4812           8 :             if (papoBlocks[iBand] != nullptr)
    4813           8 :                 papoBlocks[iBand]->DropLock();
    4814             :         }
    4815           4 :         CPLFree(papoBlocks);
    4816             :     }
    4817             : 
    4818           4 :     return eErr;
    4819             : }
    4820             : 
    4821             : //! @endcond
    4822             : 
    4823             : /************************************************************************/
    4824             : /*                  GDALCopyWholeRasterGetSwathSize()                   */
    4825             : /************************************************************************/
    4826             : 
    4827        3266 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
    4828             :                                             GDALRasterBand *poDstPrototypeBand,
    4829             :                                             int nBandCount,
    4830             :                                             int bDstIsCompressed,
    4831             :                                             int bInterleave, int *pnSwathCols,
    4832             :                                             int *pnSwathLines)
    4833             : {
    4834        3266 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    4835        3266 :     int nSrcBlockXSize = 0;
    4836        3266 :     int nSrcBlockYSize = 0;
    4837        3266 :     int nBlockXSize = 0;
    4838        3266 :     int nBlockYSize = 0;
    4839             : 
    4840        3266 :     int nXSize = poSrcPrototypeBand->GetXSize();
    4841        3266 :     int nYSize = poSrcPrototypeBand->GetYSize();
    4842             : 
    4843        3266 :     poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
    4844        3266 :     poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4845             : 
    4846        3266 :     const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
    4847        3266 :     const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
    4848             : 
    4849        3266 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    4850        3266 :     if (bInterleave)
    4851         555 :         nPixelSize *= nBandCount;
    4852             : 
    4853             :     // aim for one row of blocks.  Do not settle for less.
    4854        3266 :     int nSwathCols = nXSize;
    4855        3266 :     int nSwathLines = nMaxBlockYSize;
    4856             : 
    4857             :     const char *pszSrcCompression =
    4858        3266 :         poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    4859        3266 :     if (pszSrcCompression == nullptr)
    4860             :     {
    4861        3240 :         auto poSrcDS = poSrcPrototypeBand->GetDataset();
    4862        3240 :         if (poSrcDS)
    4863             :             pszSrcCompression =
    4864        3234 :                 poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    4865             :     }
    4866             : 
    4867             :     /* -------------------------------------------------------------------- */
    4868             :     /*      What will our swath size be?                                    */
    4869             :     /* -------------------------------------------------------------------- */
    4870             :     // When writing interleaved data in a compressed format, we want to be sure
    4871             :     // that each block will only be written once, so the swath size must not be
    4872             :     // greater than the block cache.
    4873        3266 :     const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
    4874             :     int nTargetSwathSize;
    4875        3266 :     if (pszSwathSize != nullptr)
    4876           0 :         nTargetSwathSize = static_cast<int>(
    4877           0 :             std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
    4878             :     else
    4879             :     {
    4880             :         // As a default, take one 1/4 of the cache size.
    4881        3266 :         nTargetSwathSize = static_cast<int>(
    4882        3266 :             std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
    4883             : 
    4884             :         // but if the minimum ideal swath buf size is less, then go for it to
    4885             :         // avoid unnecessarily abusing RAM usage.
    4886             :         // but try to use 10 MB at least.
    4887        3266 :         GIntBig nIdealSwathBufSize =
    4888        3266 :             static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
    4889        3266 :         int nMinTargetSwathSize = 10 * 1000 * 1000;
    4890             : 
    4891        3266 :         if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
    4892        3266 :              GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
    4893             :         {
    4894           1 :             nMinTargetSwathSize = nTargetSwathSize;
    4895             :         }
    4896             : 
    4897        3266 :         if (nIdealSwathBufSize < nTargetSwathSize &&
    4898        3256 :             nIdealSwathBufSize < nMinTargetSwathSize)
    4899             :         {
    4900        3253 :             nIdealSwathBufSize = nMinTargetSwathSize;
    4901             :         }
    4902             : 
    4903        3266 :         if (pszSrcCompression != nullptr &&
    4904         182 :             EQUAL(pszSrcCompression, "JPEG2000") &&
    4905           0 :             (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
    4906           0 :                                    (nSrcBlockYSize % nBlockYSize) == 0)))
    4907             :         {
    4908           2 :             nIdealSwathBufSize =
    4909           4 :                 std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
    4910           2 :                                                  nSrcBlockYSize * nPixelSize);
    4911             :         }
    4912        3266 :         if (nTargetSwathSize > nIdealSwathBufSize)
    4913        3253 :             nTargetSwathSize = static_cast<int>(
    4914        3253 :                 std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
    4915             :     }
    4916             : 
    4917        3266 :     if (nTargetSwathSize < 1000000)
    4918           8 :         nTargetSwathSize = 1000000;
    4919             : 
    4920             :     /* But let's check that  */
    4921        3487 :     if (bDstIsCompressed && bInterleave &&
    4922         221 :         nTargetSwathSize > GDALGetCacheMax64())
    4923             :     {
    4924           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    4925             :                  "When translating into a compressed interleave format, "
    4926             :                  "the block cache size (" CPL_FRMT_GIB ") "
    4927             :                  "should be at least the size of the swath (%d) "
    4928             :                  "(GDAL_SWATH_SIZE config. option)",
    4929             :                  GDALGetCacheMax64(), nTargetSwathSize);
    4930             :     }
    4931             : 
    4932             : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
    4933             : #define ROUND_TO(x, y) (((x) / (y)) * (y))
    4934             : 
    4935             :     // If both input and output datasets are tiled and the tile dimensions
    4936             :     // are "compatible", try to stick to a swath dimension that is a multiple
    4937             :     // of input and output block dimensions.
    4938        3266 :     if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
    4939          39 :         IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
    4940          39 :         IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
    4941          39 :         IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    4942          39 :         IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    4943             :     {
    4944          39 :         if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
    4945          39 :                 nPixelSize <=
    4946          39 :             static_cast<GIntBig>(nTargetSwathSize))
    4947             :         {
    4948          39 :             nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
    4949          39 :             nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
    4950          39 :             if (nSwathCols == 0)
    4951           0 :                 nSwathCols = nMaxBlockXSize;
    4952          39 :             if (nSwathCols > nXSize)
    4953          37 :                 nSwathCols = nXSize;
    4954          39 :             nSwathLines = nMaxBlockYSize;
    4955             : 
    4956          39 :             if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
    4957          39 :                 static_cast<GIntBig>(nTargetSwathSize))
    4958             :             {
    4959           0 :                 nSwathCols = nXSize;
    4960           0 :                 nSwathLines = nBlockYSize;
    4961             :             }
    4962             :         }
    4963             :     }
    4964             : 
    4965        3266 :     const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
    4966        3266 :     const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
    4967        3266 :     if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
    4968             :     {
    4969           1 :         nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
    4970           1 :         if (nSwathLines == 0)
    4971           1 :             nSwathLines = 1;
    4972             : 
    4973           1 :         CPLDebug(
    4974             :             "GDAL",
    4975             :             "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
    4976             :             "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
    4977             :             "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
    4978           1 :             nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
    4979             :     }
    4980             :     // If we are processing single scans, try to handle several at once.
    4981             :     // If we are handling swaths already, only grow the swath if a row
    4982             :     // of blocks is substantially less than our target buffer size.
    4983        3265 :     else if (nSwathLines == 1 ||
    4984        2716 :              nMemoryPerCol * nSwathLines <
    4985        2716 :                  static_cast<GIntBig>(nTargetSwathSize) / 10)
    4986             :     {
    4987        3237 :         nSwathLines = std::min(
    4988             :             nYSize,
    4989        3237 :             std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
    4990             : 
    4991             :         /* If possible try to align to source and target block height */
    4992        3237 :         if ((nSwathLines % nMaxBlockYSize) != 0 &&
    4993         257 :             nSwathLines > nMaxBlockYSize &&
    4994         257 :             IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    4995         228 :             IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    4996         206 :             nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
    4997             :     }
    4998             : 
    4999        3266 :     if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
    5000           0 :         (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
    5001           0 :                                IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
    5002             :     {
    5003             :         // Typical use case: converting from Pleiades that is 2048x2048 tiled.
    5004           2 :         if (nSwathLines < nSrcBlockYSize)
    5005             :         {
    5006           0 :             nSwathLines = nSrcBlockYSize;
    5007             : 
    5008             :             // Number of pixels that can be read/write simultaneously.
    5009           0 :             nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
    5010           0 :             nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
    5011           0 :             if (nSwathCols == 0)
    5012           0 :                 nSwathCols = nSrcBlockXSize;
    5013           0 :             if (nSwathCols > nXSize)
    5014           0 :                 nSwathCols = nXSize;
    5015             : 
    5016           0 :             CPLDebug(
    5017             :                 "GDAL",
    5018             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    5019             :                 "too high block, "
    5020             :                 "use partial width at one time");
    5021             :         }
    5022           2 :         else if ((nSwathLines % nSrcBlockYSize) != 0)
    5023             :         {
    5024             :             /* Round on a multiple of nSrcBlockYSize */
    5025           0 :             nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
    5026           0 :             CPLDebug(
    5027             :                 "GDAL",
    5028             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    5029             :                 "round nSwathLines to block height : %d",
    5030             :                 nSwathLines);
    5031             :         }
    5032             :     }
    5033        3264 :     else if (bDstIsCompressed)
    5034             :     {
    5035         412 :         if (nSwathLines < nBlockYSize)
    5036             :         {
    5037         146 :             nSwathLines = nBlockYSize;
    5038             : 
    5039             :             // Number of pixels that can be read/write simultaneously.
    5040         146 :             nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
    5041         146 :             nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
    5042         146 :             if (nSwathCols == 0)
    5043           0 :                 nSwathCols = nBlockXSize;
    5044         146 :             if (nSwathCols > nXSize)
    5045         146 :                 nSwathCols = nXSize;
    5046             : 
    5047         146 :             CPLDebug(
    5048             :                 "GDAL",
    5049             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    5050             :                 "too high block, "
    5051             :                 "use partial width at one time");
    5052             :         }
    5053         266 :         else if ((nSwathLines % nBlockYSize) != 0)
    5054             :         {
    5055             :             // Round on a multiple of nBlockYSize.
    5056           9 :             nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
    5057           9 :             CPLDebug(
    5058             :                 "GDAL",
    5059             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    5060             :                 "round nSwathLines to block height : %d",
    5061             :                 nSwathLines);
    5062             :         }
    5063             :     }
    5064             : 
    5065        3266 :     *pnSwathCols = nSwathCols;
    5066        3266 :     *pnSwathLines = nSwathLines;
    5067        3266 : }
    5068             : 
    5069             : /************************************************************************/
    5070             : /*                     GDALDatasetCopyWholeRaster()                     */
    5071             : /************************************************************************/
    5072             : 
    5073             : /**
    5074             :  * \brief Copy all dataset raster data.
    5075             :  *
    5076             :  * This function copies the complete raster contents of one dataset to
    5077             :  * another similarly configured dataset.  The source and destination
    5078             :  * dataset must have the same number of bands, and the same width
    5079             :  * and height.  The bands do not have to have the same data type.
    5080             :  *
    5081             :  * This function is primarily intended to support implementation of
    5082             :  * driver specific CreateCopy() functions.  It implements efficient copying,
    5083             :  * in particular "chunking" the copy in substantial blocks and, if appropriate,
    5084             :  * performing the transfer in a pixel interleaved fashion.
    5085             :  *
    5086             :  * Currently the only papszOptions values supported are:
    5087             :  * <ul>
    5088             :  * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
    5089             :  * write access pattern (this does not modify the layout of the destination
    5090             :  * data)</li> <li>"COMPRESSED=YES" to force alignment on target dataset block
    5091             :  * sizes to achieve best compression.</li> <li>"SKIP_HOLES=YES" to skip chunks
    5092             :  * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
    5093             :  * (GDAL &gt;= 2.2)</li>
    5094             :  * </ul>
    5095             :  * More options may be supported in the future.
    5096             :  *
    5097             :  * @param hSrcDS the source dataset
    5098             :  * @param hDstDS the destination dataset
    5099             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    5100             :  * @param pfnProgress progress reporting function.
    5101             :  * @param pProgressData callback data for progress function.
    5102             :  *
    5103             :  * @return CE_None on success, or CE_Failure on failure.
    5104             :  */
    5105             : 
    5106        3238 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
    5107             :                                               GDALDatasetH hDstDS,
    5108             :                                               CSLConstList papszOptions,
    5109             :                                               GDALProgressFunc pfnProgress,
    5110             :                                               void *pProgressData)
    5111             : 
    5112             : {
                           // Outline: validate the handles, check that both datasets agree on
                           // width, height and band count, choose between the band-by-band and
                           // the pixel-interleaved access pattern, size a scratch buffer with
                           // GDALCopyWholeRasterGetSwathSize(), then copy the raster swath by
                           // swath (read from the source, write to the destination), reporting
                           // progress after every swath.
    5113        3238 :     VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    5114        3238 :     VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    5115             : 
    5116        3238 :     GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
    5117        3238 :     GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
    5118             : 
                           // Substitute GDALDummyProgress so that pfnProgress can be called
                           // unconditionally below.
    5119        3238 :     if (pfnProgress == nullptr)
    5120           0 :         pfnProgress = GDALDummyProgress;
    5121             : 
    5122             :     /* -------------------------------------------------------------------- */
    5123             :     /*      Confirm the datasets match in size and band counts.             */
    5124             :     /* -------------------------------------------------------------------- */
                           // The destination's dimensions and band count are the reference
                           // values the source is compared against.
    5125        3238 :     const int nXSize = poDstDS->GetRasterXSize();
    5126        3238 :     const int nYSize = poDstDS->GetRasterYSize();
    5127        3238 :     const int nBandCount = poDstDS->GetRasterCount();
    5128             : 
    5129        3238 :     if (poSrcDS->GetRasterXSize() != nXSize ||
    5130        6476 :         poSrcDS->GetRasterYSize() != nYSize ||
    5131        3238 :         poSrcDS->GetRasterCount() != nBandCount)
    5132             :     {
    5133           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    5134             :                  "Input and output dataset sizes or band counts do not\n"
    5135             :                  "match in GDALDatasetCopyWholeRaster()");
    5136           0 :         return CE_Failure;
    5137             :     }
    5138             : 
    5139             :     /* -------------------------------------------------------------------- */
    5140             :     /*      Report preliminary (0) progress.                                */
    5141             :     /* -------------------------------------------------------------------- */
    5142        3238 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    5143             :     {
    5144           1 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    5145             :                  "User terminated CreateCopy()");
    5146           1 :         return CE_Failure;
    5147             :     }
    5148             : 
    5149             :     /* -------------------------------------------------------------------- */
    5150             :     /*      Get our prototype band, and assume the others are similarly     */
    5151             :     /*      configured.                                                     */
    5152             :     /* -------------------------------------------------------------------- */
                           // A dataset without bands has nothing to copy; this is reported as
                           // success.
    5153        3237 :     if (nBandCount == 0)
    5154           0 :         return CE_None;
    5155             : 
    5156        3237 :     GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
    5157        3237 :     GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
                           // eDT, the data type of the first destination band, is the type used
                           // for the swath buffer and for every RasterIO() call below.
    5158        3237 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    5159             : 
    5160             :     /* -------------------------------------------------------------------- */
    5161             :     /*      Do we want to try and do the operation in a pixel               */
    5162             :     /*      interleaved fashion?                                            */
    5163             :     /* -------------------------------------------------------------------- */
                           // Start from band-by-band access. A "PIXEL" or "LINE" INTERLEAVE item
                           // in the IMAGE_STRUCTURE metadata domain of either dataset switches
                           // to interleaved access.
    5164        3237 :     bool bInterleave = false;
    5165             :     const char *pszInterleave =
    5166        3237 :         poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    5167        3237 :     if (pszInterleave != nullptr &&
    5168        2848 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    5169         188 :         bInterleave = true;
    5170             : 
    5171        3237 :     pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    5172        3237 :     if (pszInterleave != nullptr &&
    5173        2770 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    5174         502 :         bInterleave = true;
    5175             : 
                           // An explicit INTERLEAVE option takes precedence over the metadata.
                           // An unrecognized value only triggers a warning and leaves the choice
                           // made above untouched.
    5176        3237 :     pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
    5177        3237 :     if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
    5178           5 :         bInterleave = true;
    5179        3232 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
    5180          13 :         bInterleave = false;
    5181             :     // attributes is specific to the TileDB driver
    5182        3219 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
    5183           4 :         bInterleave = true;
    5184        3215 :     else if (pszInterleave != nullptr)
    5185             :     {
    5186           0 :         CPLError(CE_Warning, CPLE_NotSupported,
    5187             :                  "Unsupported value for option INTERLEAVE");
    5188             :     }
    5189             : 
    5190             :     // If the destination is compressed, we must try to write blocks just once,
    5191             :     // to save disk space (GTiff case for example), and to avoid data loss
    5192             :     // (JPEG compression for example).
                           // Within this function the flag is only forwarded to
                           // GDALCopyWholeRasterGetSwathSize().
    5193        3237 :     bool bDstIsCompressed = false;
    5194             :     const char *pszDstCompressed =
    5195        3237 :         CSLFetchNameValue(papszOptions, "COMPRESSED");
    5196        3237 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5197         386 :         bDstIsCompressed = true;
    5198             : 
    5199             :     /* -------------------------------------------------------------------- */
    5200             :     /*      What will our swath size be?                                    */
    5201             :     /* -------------------------------------------------------------------- */
    5202             : 
    5203        3237 :     int nSwathCols = 0;
    5204        3237 :     int nSwathLines = 0;
    5205        3237 :     GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
    5206             :                                     nBandCount, bDstIsCompressed, bInterleave,
    5207             :                                     &nSwathCols, &nSwathLines);
    5208             : 
                           // Bytes per pixel in the swath buffer: one eDT sample, or one sample
                           // per band when interleaving.
    5209        3237 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5210        3237 :     if (bInterleave)
    5211         555 :         nPixelSize *= nBandCount;
    5212             : 
                           // NOTE(review): the _VERBOSE allocator presumably reports the failure
                           // itself, which would explain why no CPLError() is emitted here -
                           // confirm.
    5213        3237 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    5214        3237 :     if (pSwathBuf == nullptr)
    5215             :     {
    5216           0 :         return CE_Failure;
    5217             :     }
    5218             : 
    5219        3237 :     CPLDebug("GDAL",
    5220             :              "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
    5221             :              nSwathCols, nSwathLines, static_cast<int>(bInterleave));
    5222             : 
    5223             :     // Advise the source raster that we are going to read it completely
    5224             :     // Note: this might already have been done by GDALCreateCopy() in the
    5225             :     // likely case this function is indirectly called by it
    5226        3237 :     poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
    5227        3237 :                         nullptr, nullptr);
    5228             : 
    5229             :     /* ==================================================================== */
    5230             :     /*      Band oriented (uninterleaved) case.                             */
    5231             :     /* ==================================================================== */
    5232        3237 :     CPLErr eErr = CE_None;
                           // SKIP_HOLES defaults to "NO": by default every swath is copied.
    5233             :     const bool bCheckHoles =
    5234        3237 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    5235             : 
    5236        3237 :     if (!bInterleave)
    5237             :     {
    5238             :         GDALRasterIOExtraArg sExtraArg;
    5239        2682 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5240        2682 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    5241             : 
                               // Total number of swaths over all bands; only used to scale the
                               // progress values.
    5242        8046 :         const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
    5243        2682 :                                      DIV_ROUND_UP(nYSize, nSwathLines) *
    5244        2682 :                                      DIV_ROUND_UP(nXSize, nSwathCols);
    5245        2682 :         GIntBig nBlocksDone = 0;
    5246             : 
    5247        7794 :         for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
    5248             :         {
    5249        5112 :             int nBand = iBand + 1;
    5250             : 
    5251       10499 :             for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5252             :             {
    5253        5387 :                 int nThisLines = nSwathLines;
    5254             : 
                                       // Clip the last row of swaths to the raster height.
    5255        5387 :                 if (iY + nThisLines > nYSize)
    5256         380 :                     nThisLines = nYSize - iY;
    5257             : 
    5258       10774 :                 for (int iX = 0; iX < nXSize && eErr == CE_None;
    5259        5387 :                      iX += nSwathCols)
    5260             :                 {
    5261        5387 :                     int nThisCols = nSwathCols;
    5262             : 
                                           // Clip the last column of swaths to the raster width.
    5263        5387 :                     if (iX + nThisCols > nXSize)
    5264           0 :                         nThisCols = nXSize - iX;
    5265             : 
                                           // Unless SKIP_HOLES is enabled, the swath is assumed to
                                           // hold data.
    5266        5387 :                     int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5267        5387 :                     if (bCheckHoles)
    5268             :                     {
    5269             :                         nStatus = poSrcDS->GetRasterBand(nBand)
    5270        3698 :                                       ->GetDataCoverageStatus(
    5271             :                                           iX, iY, nThisCols, nThisLines,
    5272             :                                           GDAL_DATA_COVERAGE_STATUS_DATA);
    5273             :                     }
                                           // Copy the swath only when the DATA bit is set.
    5274        5387 :                     if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5275             :                     {
                                               // The read is given the first half of this swath's
                                               // progress interval through a scaled progress
                                               // callback.
    5276        5383 :                         sExtraArg.pfnProgress = GDALScaledProgress;
    5277       10766 :                         sExtraArg.pProgressData = GDALCreateScaledProgress(
    5278        5383 :                             nBlocksDone / static_cast<double>(nTotalBlocks),
    5279        5383 :                             (nBlocksDone + 0.5) /
    5280        5383 :                                 static_cast<double>(nTotalBlocks),
    5281             :                             pfnProgress, pProgressData);
                                               // No scaled progress context was created: perform
                                               // the read without any progress callback.
    5282        5383 :                         if (sExtraArg.pProgressData == nullptr)
    5283        1659 :                             sExtraArg.pfnProgress = nullptr;
    5284             : 
                                               // Band count 1 with &nBand as band list: only the
                                               // current band is transferred.
                                               // NOTE(review): the three zeros are the spacing
                                               // arguments and presumably select the default
                                               // packed layout - confirm.
    5285        5383 :                         eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    5286             :                                                  nThisLines, pSwathBuf,
    5287             :                                                  nThisCols, nThisLines, eDT, 1,
    5288             :                                                  &nBand, 0, 0, 0, &sExtraArg);
    5289             : 
    5290        5383 :                         GDALDestroyScaledProgress(sExtraArg.pProgressData);
    5291             : 
                                               // Write the swath out only if the read succeeded.
    5292        5383 :                         if (eErr == CE_None)
    5293        5376 :                             eErr = poDstDS->RasterIO(
    5294             :                                 GF_Write, iX, iY, nThisCols, nThisLines,
    5295             :                                 pSwathBuf, nThisCols, nThisLines, eDT, 1,
    5296             :                                 &nBand, 0, 0, 0, nullptr);
    5297             :                     }
    5298             : 
                                           // Skipped swaths are counted too, so the reported ratio
                                           // still ends at nTotalBlocks / nTotalBlocks.
    5299        5387 :                     nBlocksDone++;
                                           // A false return from the progress callback aborts the
                                           // copy with CE_Failure.
    5300       10732 :                     if (eErr == CE_None &&
    5301        5345 :                         !pfnProgress(nBlocksDone /
    5302        5345 :                                          static_cast<double>(nTotalBlocks),
    5303             :                                      nullptr, pProgressData))
    5304             :                     {
    5305           2 :                         eErr = CE_Failure;
    5306           2 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    5307             :                                  "User terminated CreateCopy()");
    5308             :                     }
    5309             :                 }
    5310             :             }
    5311             :         }
    5312             :     }
    5313             : 
    5314             :     /* ==================================================================== */
    5315             :     /*      Pixel interleaved case.                                         */
    5316             :     /* ==================================================================== */
    5317             :     else /* if( bInterleave ) */
    5318             :     {
    5319             :         GDALRasterIOExtraArg sExtraArg;
    5320         555 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5321         555 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    5322             : 
                               // Each swath covers all bands here, so the band count does not
                               // enter the total.
    5323         555 :         const GIntBig nTotalBlocks =
    5324         555 :             static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
    5325         555 :             DIV_ROUND_UP(nXSize, nSwathCols);
    5326         555 :         GIntBig nBlocksDone = 0;
    5327             : 
    5328        1328 :         for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5329             :         {
    5330         773 :             int nThisLines = nSwathLines;
    5331             : 
                                   // Clip the last row of swaths to the raster height.
    5332         773 :             if (iY + nThisLines > nYSize)
    5333         194 :                 nThisLines = nYSize - iY;
    5334             : 
    5335        1551 :             for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    5336             :             {
    5337         778 :                 int nThisCols = nSwathCols;
    5338             : 
                                       // Clip the last column of swaths to the raster width.
    5339         778 :                 if (iX + nThisCols > nXSize)
    5340           3 :                     nThisCols = nXSize - iX;
    5341             : 
    5342         778 :                 int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5343         778 :                 if (bCheckHoles)
    5344             :                 {
                                           // OR the coverage status of the bands together,
                                           // stopping at the first band that reports data.
    5345         547 :                     nStatus = 0;
    5346         600 :                     for (int iBand = 0; iBand < nBandCount; iBand++)
    5347             :                     {
    5348         581 :                         nStatus |= poSrcDS->GetRasterBand(iBand + 1)
    5349         581 :                                        ->GetDataCoverageStatus(
    5350             :                                            iX, iY, nThisCols, nThisLines,
    5351             :                                            GDAL_DATA_COVERAGE_STATUS_DATA);
    5352         581 :                         if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5353         528 :                             break;
    5354             :                     }
    5355             :                 }
                                       // Copy the swath only when the DATA bit is set.
    5356         778 :                 if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5357             :                 {
                                           // The read is given the first half of this swath's
                                           // progress interval through a scaled progress callback.
    5358         759 :                     sExtraArg.pfnProgress = GDALScaledProgress;
    5359        1518 :                     sExtraArg.pProgressData = GDALCreateScaledProgress(
    5360         759 :                         nBlocksDone / static_cast<double>(nTotalBlocks),
    5361         759 :                         (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
    5362             :                         pfnProgress, pProgressData);
    5363         759 :                     if (sExtraArg.pProgressData == nullptr)
    5364         346 :                         sExtraArg.pfnProgress = nullptr;
    5365             : 
                                           // All nBandCount bands go through the buffer in one
                                           // call. NOTE(review): a null band list presumably
                                           // selects bands 1..nBandCount - confirm.
    5366         759 :                     eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    5367             :                                              nThisLines, pSwathBuf, nThisCols,
    5368             :                                              nThisLines, eDT, nBandCount,
    5369             :                                              nullptr, 0, 0, 0, &sExtraArg);
    5370             : 
    5371         759 :                     GDALDestroyScaledProgress(sExtraArg.pProgressData);
    5372             : 
                                           // Write the swath out only if the read succeeded.
    5373         759 :                     if (eErr == CE_None)
    5374         758 :                         eErr = poDstDS->RasterIO(
    5375             :                             GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
    5376             :                             nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
    5377             :                             0, 0, nullptr);
    5378             :                 }
    5379             : 
                                       // Skipped swaths are counted too; a false return from the
                                       // progress callback aborts the copy with CE_Failure.
    5380         778 :                 nBlocksDone++;
    5381        1552 :                 if (eErr == CE_None &&
    5382         774 :                     !pfnProgress(nBlocksDone /
    5383         774 :                                      static_cast<double>(nTotalBlocks),
    5384             :                                  nullptr, pProgressData))
    5385             :                 {
    5386           1 :                     eErr = CE_Failure;
    5387           1 :                     CPLError(CE_Failure, CPLE_UserInterrupt,
    5388             :                              "User terminated CreateCopy()");
    5389             :                 }
    5390             :             }
    5391             :         }
    5392             :     }
    5393             : 
    5394             :     /* -------------------------------------------------------------------- */
    5395             :     /*      Cleanup                                                         */
    5396             :     /* -------------------------------------------------------------------- */
    5397        3237 :     CPLFree(pSwathBuf);
    5398             : 
    5399        3237 :     return eErr;
    5400             : }
    5401             : 
    5402             : /************************************************************************/
    5403             : /*                     GDALRasterBandCopyWholeRaster()                  */
    5404             : /************************************************************************/
    5405             : 
    5406             : /**
    5407             :  * \brief Copy a whole raster band
    5408             :  *
    5409             :  * This function copies the complete raster contents of one band to
    5410             :  * another similarly configured band.  The source and destination
    5411             :  * bands must have the same width and height.  The bands do not have
    5412             :  * to have the same data type.
    5413             :  *
    5414             :  * It implements efficient copying, in particular "chunking" the copy in
    5415             :  * substantial blocks.
    5416             :  *
    5417             :  * Currently the only papszOptions values supported are:
    5418             :  * <ul>
    5419             :  * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
    5420             :  * achieve best compression.</li>
    5421             :  * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
    5422             :  * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL &gt;= 2.2)</li>
    5423             :  * </ul>
    5424             :  *
    5425             :  * @param hSrcBand the source band
    5426             :  * @param hDstBand the destination band
    5427             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    5428             :  * @param pfnProgress progress reporting function.
    5429             :  * @param pProgressData callback data for progress function.
    5430             :  *
    5431             :  * @return CE_None on success, or CE_Failure on failure.
    5432             :  */
    5433             : 
    5434          29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
    5435             :     GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
    5436             :     const char *const *const papszOptions, GDALProgressFunc pfnProgress,
    5437             :     void *pProgressData)
    5438             : 
    5439             : {
    5440          29 :     VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    5441          29 :     VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    5442             : 
    5443          29 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    5444          29 :     GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
    5445          29 :     CPLErr eErr = CE_None;
    5446             : 
    5447          29 :     if (pfnProgress == nullptr)
    5448           2 :         pfnProgress = GDALDummyProgress;
    5449             : 
    5450             :     /* -------------------------------------------------------------------- */
    5451             :     /*      Confirm the datasets match in size and band counts.             */
    5452             :     /* -------------------------------------------------------------------- */
    5453          29 :     int nXSize = poSrcBand->GetXSize();
    5454          29 :     int nYSize = poSrcBand->GetYSize();
    5455             : 
    5456          29 :     if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
    5457             :     {
    5458           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    5459             :                  "Input and output band sizes do not\n"
    5460             :                  "match in GDALRasterBandCopyWholeRaster()");
    5461           0 :         return CE_Failure;
    5462             :     }
    5463             : 
    5464             :     /* -------------------------------------------------------------------- */
    5465             :     /*      Report preliminary (0) progress.                                */
    5466             :     /* -------------------------------------------------------------------- */
    5467          29 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    5468             :     {
    5469           0 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    5470             :                  "User terminated CreateCopy()");
    5471           0 :         return CE_Failure;
    5472             :     }
    5473             : 
    5474          29 :     GDALDataType eDT = poDstBand->GetRasterDataType();
    5475             : 
    5476             :     // If the destination is compressed, we must try to write blocks just once,
    5477             :     // to save disk space (GTiff case for example), and to avoid data loss
    5478             :     // (JPEG compression for example).
    5479          29 :     bool bDstIsCompressed = false;
    5480             :     const char *pszDstCompressed =
    5481          29 :         CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
    5482          29 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5483          26 :         bDstIsCompressed = true;
    5484             : 
    5485             :     /* -------------------------------------------------------------------- */
    5486             :     /*      What will our swath size be?                                    */
    5487             :     /* -------------------------------------------------------------------- */
    5488             : 
    5489          29 :     int nSwathCols = 0;
    5490          29 :     int nSwathLines = 0;
    5491          29 :     GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
    5492             :                                     FALSE, &nSwathCols, &nSwathLines);
    5493             : 
    5494          29 :     const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5495             : 
    5496          29 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    5497          29 :     if (pSwathBuf == nullptr)
    5498             :     {
    5499           0 :         return CE_Failure;
    5500             :     }
    5501             : 
    5502          29 :     CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
    5503             :              nSwathCols, nSwathLines);
    5504             : 
    5505             :     const bool bCheckHoles =
    5506          29 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    5507             : 
    5508             :     // Advise the source raster that we are going to read it completely
    5509          29 :     poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
    5510             : 
    5511             :     /* ==================================================================== */
    5512             :     /*      Band oriented (uninterleaved) case.                             */
    5513             :     /* ==================================================================== */
    5514             : 
    5515          72 :     for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5516             :     {
    5517          43 :         int nThisLines = nSwathLines;
    5518             : 
    5519          43 :         if (iY + nThisLines > nYSize)
    5520           8 :             nThisLines = nYSize - iY;
    5521             : 
    5522          86 :         for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    5523             :         {
    5524          43 :             int nThisCols = nSwathCols;
    5525             : 
    5526          43 :             if (iX + nThisCols > nXSize)
    5527           0 :                 nThisCols = nXSize - iX;
    5528             : 
    5529          43 :             int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5530          43 :             if (bCheckHoles)
    5531             :             {
    5532           0 :                 nStatus = poSrcBand->GetDataCoverageStatus(
    5533             :                     iX, iY, nThisCols, nThisLines,
    5534             :                     GDAL_DATA_COVERAGE_STATUS_DATA);
    5535             :             }
    5536          43 :             if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5537             :             {
    5538          43 :                 eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
    5539             :                                            nThisLines, pSwathBuf, nThisCols,
    5540             :                                            nThisLines, eDT, 0, 0, nullptr);
    5541             : 
    5542          43 :                 if (eErr == CE_None)
    5543          43 :                     eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
    5544             :                                                nThisLines, pSwathBuf, nThisCols,
    5545             :                                                nThisLines, eDT, 0, 0, nullptr);
    5546             :             }
    5547             : 
    5548          86 :             if (eErr == CE_None && !pfnProgress(double(iY + nThisLines) /
    5549          43 :                                                     static_cast<double>(nYSize),
    5550             :                                                 nullptr, pProgressData))
    5551             :             {
    5552           0 :                 eErr = CE_Failure;
    5553           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt,
    5554             :                          "User terminated CreateCopy()");
    5555             :             }
    5556             :         }
    5557             :     }
    5558             : 
    5559             :     /* -------------------------------------------------------------------- */
    5560             :     /*      Cleanup                                                         */
    5561             :     /* -------------------------------------------------------------------- */
    5562          29 :     CPLFree(pSwathBuf);
    5563             : 
    5564          29 :     return eErr;
    5565             : }
    5566             : 
    5567             : /************************************************************************/
    5568             : /*                      GDALCopyRasterIOExtraArg ()                     */
    5569             : /************************************************************************/
    5570             : 
    5571      527171 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
    5572             :                               GDALRasterIOExtraArg *psSrcArg)
    5573             : {
    5574      527171 :     INIT_RASTERIO_EXTRA_ARG(*psDestArg);
    5575      527171 :     if (psSrcArg)
    5576             :     {
    5577      527171 :         psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
    5578      527171 :         psDestArg->pfnProgress = psSrcArg->pfnProgress;
    5579      527171 :         psDestArg->pProgressData = psSrcArg->pProgressData;
    5580      527171 :         psDestArg->bFloatingPointWindowValidity =
    5581      527171 :             psSrcArg->bFloatingPointWindowValidity;
    5582      527171 :         if (psSrcArg->bFloatingPointWindowValidity)
    5583             :         {
    5584      204370 :             psDestArg->dfXOff = psSrcArg->dfXOff;
    5585      204370 :             psDestArg->dfYOff = psSrcArg->dfYOff;
    5586      204370 :             psDestArg->dfXSize = psSrcArg->dfXSize;
    5587      204370 :             psDestArg->dfYSize = psSrcArg->dfYSize;
    5588             :         }
    5589      527171 :         if (psSrcArg->nVersion >= 2)
    5590             :         {
    5591      527171 :             psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
    5592             :         }
    5593             :     }
    5594      527171 : }
    5595             : 
    5596             : /************************************************************************/
    5597             : /*                         HasOnlyNoData()                              */
    5598             : /************************************************************************/
    5599             : 
    5600    25108002 : template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
    5601             : {
    5602    25108002 :     return value == noDataValue;
    5603             : }
    5604             : 
    5605           0 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
    5606             : {
    5607             :     using std::isnan;
    5608           0 :     return isnan(noDataValue) ? isnan(value) : value == noDataValue;
    5609             : }
    5610             : 
    5611      625506 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
    5612             : {
    5613      625506 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5614             : }
    5615             : 
    5616    13546900 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
    5617             : {
    5618    13546900 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5619             : }
    5620             : 
    5621             : template <class T>
    5622       15894 : static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
    5623             :                            size_t nHeight, size_t nLineStride,
    5624             :                            size_t nComponents)
    5625             : {
    5626             :     // Fast test: check the 4 corners and the middle pixel.
    5627       30885 :     for (size_t iBand = 0; iBand < nComponents; iBand++)
    5628             :     {
    5629       32537 :         if (!(IsEqualToNoData(pBuffer[iBand], noDataValue) &&
    5630       16181 :               IsEqualToNoData(pBuffer[(nWidth - 1) * nComponents + iBand],
    5631       15957 :                               noDataValue) &&
    5632       15957 :               IsEqualToNoData(
    5633       15957 :                   pBuffer[((nHeight - 1) / 2 * nLineStride + (nWidth - 1) / 2) *
    5634       15957 :                               nComponents +
    5635             :                           iBand],
    5636       15004 :                   noDataValue) &&
    5637       15004 :               IsEqualToNoData(
    5638       15004 :                   pBuffer[(nHeight - 1) * nLineStride * nComponents + iBand],
    5639             :                   noDataValue) &&
    5640       14996 :               IsEqualToNoData(
    5641       14996 :                   pBuffer[((nHeight - 1) * nLineStride + nWidth - 1) *
    5642       14996 :                               nComponents +
    5643             :                           iBand],
    5644             :                   noDataValue)))
    5645             :         {
    5646        1365 :             return false;
    5647             :         }
    5648             :     }
    5649             : 
    5650             :     // Test all pixels.
    5651       46061 :     for (size_t iY = 0; iY < nHeight; iY++)
    5652             :     {
    5653       31590 :         const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
    5654    39233392 :         for (size_t iX = 0; iX < nWidth * nComponents; iX++)
    5655             :         {
    5656    39201946 :             if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
    5657             :             {
    5658          58 :                 return false;
    5659             :             }
    5660             :         }
    5661             :     }
    5662       14471 :     return true;
    5663             : }
    5664             : 
    5665             : /************************************************************************/
    5666             : /*                    GDALBufferHasOnlyNoData()                         */
    5667             : /************************************************************************/
    5668             : 
    5669       42608 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
    5670             :                              size_t nWidth, size_t nHeight, size_t nLineStride,
    5671             :                              size_t nComponents, int nBitsPerSample,
    5672             :                              GDALBufferSampleFormat nSampleFormat)
    5673             : {
    5674             :     // In the case where the nodata is 0, we can compare several bytes at
    5675             :     // once. Select the largest natural integer type for the architecture.
    5676             : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
    5677             :     // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
    5678             :     typedef std::uint64_t WordType;
    5679             : #else
    5680             :     typedef std::uint32_t WordType;
    5681             : #endif
    5682       42608 :     if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    5683             :         // Do not use this optimized code path for floating point numbers,
    5684             :         // as it can't detect negative zero.
    5685             :         nSampleFormat != GSF_FLOATING_POINT)
    5686             :     {
    5687       26708 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    5688       26708 :         const size_t nSize =
    5689       26708 :             (nWidth * nHeight * nComponents * nBitsPerSample + 7) / 8;
    5690       26708 :         size_t i = 0;
    5691             :         const size_t nInitialIters =
    5692       53416 :             std::min(sizeof(WordType) -
    5693       26708 :                          static_cast<size_t>(
    5694             :                              reinterpret_cast<std::uintptr_t>(pabyBuffer) %
    5695             :                              sizeof(WordType)),
    5696       26708 :                      nSize);
    5697      220229 :         for (; i < nInitialIters; i++)
    5698             :         {
    5699      197907 :             if (pabyBuffer[i])
    5700        4386 :                 return false;
    5701             :         }
    5702    16495900 :         for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
    5703             :         {
    5704    16480800 :             if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
    5705        7193 :                 return false;
    5706             :         }
    5707       52518 :         for (; i < nSize; i++)
    5708             :         {
    5709       37394 :             if (pabyBuffer[i])
    5710           5 :                 return false;
    5711             :         }
    5712       15124 :         return true;
    5713             :     }
    5714             : 
    5715       15900 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
    5716             :     {
    5717       22272 :         return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
    5718       11136 :                HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
    5719       11136 :                               static_cast<uint8_t>(dfNoDataValue), nWidth,
    5720       11136 :                               nHeight, nLineStride, nComponents);
    5721             :     }
    5722        4764 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
    5723             :     {
    5724             :         // Use unsigned implementation by converting the nodatavalue to
    5725             :         // unsigned
    5726          63 :         return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
    5727          31 :                HasOnlyNoDataT(
    5728             :                    static_cast<const uint8_t *>(pBuffer),
    5729          31 :                    static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
    5730          32 :                    nWidth, nHeight, nLineStride, nComponents);
    5731             :     }
    5732        4732 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
    5733             :     {
    5734          23 :         return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
    5735          11 :                HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
    5736          11 :                               static_cast<uint16_t>(dfNoDataValue), nWidth,
    5737          12 :                               nHeight, nLineStride, nComponents);
    5738             :     }
    5739        4720 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
    5740             :     {
    5741             :         // Use unsigned implementation by converting the nodatavalue to
    5742             :         // unsigned
    5743          99 :         return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
    5744          49 :                HasOnlyNoDataT(
    5745             :                    static_cast<const uint16_t *>(pBuffer),
    5746          49 :                    static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
    5747          50 :                    nWidth, nHeight, nLineStride, nComponents);
    5748             :     }
    5749        4670 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
    5750             :     {
    5751          73 :         return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
    5752          36 :                HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
    5753             :                               static_cast<uint32_t>(dfNoDataValue), nWidth,
    5754          37 :                               nHeight, nLineStride, nComponents);
    5755             :     }
    5756        4633 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
    5757             :     {
    5758             :         // Use unsigned implementation by converting the nodatavalue to
    5759             :         // unsigned
    5760          23 :         return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
    5761          11 :                HasOnlyNoDataT(
    5762             :                    static_cast<const uint32_t *>(pBuffer),
    5763          11 :                    static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
    5764          12 :                    nWidth, nHeight, nLineStride, nComponents);
    5765             :     }
    5766        4621 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
    5767             :     {
    5768          56 :         return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
    5769          28 :                HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
    5770             :                               static_cast<uint64_t>(dfNoDataValue), nWidth,
    5771          28 :                               nHeight, nLineStride, nComponents);
    5772             :     }
    5773        4593 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
    5774             :     {
    5775             :         // Use unsigned implementation by converting the nodatavalue to
    5776             :         // unsigned
    5777           0 :         return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
    5778           0 :                HasOnlyNoDataT(
    5779             :                    static_cast<const uint64_t *>(pBuffer),
    5780           0 :                    static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
    5781           0 :                    nWidth, nHeight, nLineStride, nComponents);
    5782             :     }
    5783        4593 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
    5784             :     {
    5785           0 :         return (std::isnan(dfNoDataValue) ||
    5786           0 :                 GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
    5787           0 :                HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
    5788             :                               static_cast<GFloat16>(dfNoDataValue), nWidth,
    5789           0 :                               nHeight, nLineStride, nComponents);
    5790             :     }
    5791        4593 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    5792             :     {
    5793         758 :         return (std::isnan(dfNoDataValue) ||
    5794        1515 :                 GDALIsValueInRange<float>(dfNoDataValue)) &&
    5795         757 :                HasOnlyNoDataT(static_cast<const float *>(pBuffer),
    5796             :                               static_cast<float>(dfNoDataValue), nWidth,
    5797         758 :                               nHeight, nLineStride, nComponents);
    5798             :     }
    5799        3835 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    5800             :     {
    5801        3835 :         return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
    5802             :                               dfNoDataValue, nWidth, nHeight, nLineStride,
    5803        3835 :                               nComponents);
    5804             :     }
    5805           0 :     return false;
    5806             : }
    5807             : 
    5808             : #ifdef HAVE_SSE2
    5809             : 
    5810             : /************************************************************************/
    5811             : /*                    GDALDeinterleave3Byte()                           */
    5812             : /************************************************************************/
    5813             : 
                       // Splits nIters pixels stored as interleaved byte triplets in pabySrc
                       // (pabySrc[3 * i], pabySrc[3 * i + 1], pabySrc[3 * i + 2]) into the
                       // three arrays pabyDest0, pabyDest1 and pabyDest2, one byte per pixel
                       // in each.
                       //
                       // When compiled with GCC (not clang), tree vectorization is disabled
                       // for this function through the attribute below.
    5814             : #if defined(__GNUC__) && !defined(__clang__)
    5815             : __attribute__((optimize("no-tree-vectorize")))
    5816             : #endif
    5817             : static void
    5818      322342 : GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    5819             :                       GByte *CPL_RESTRICT pabyDest0,
    5820             :                       GByte *CPL_RESTRICT pabyDest1,
    5821             :                       GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    5822             : #ifdef USE_NEON_OPTIMIZATIONS
    5823             : {
                           // With USE_NEON_OPTIMIZATIONS, always forward to
                           // GDALDeinterleave3Byte_SSSE3() (defined elsewhere).
    5824             :     return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5825             :                                        nIters);
    5826             : }
    5827             : #else
    5828             : {
    5829             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
                           // Forward to the SSSE3 implementation when CPLHaveRuntimeSSSE3()
                           // reports it can be used.
    5830      322342 :     if (CPLHaveRuntimeSSSE3())
    5831             :     {
    5832      322364 :         return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    5833      322354 :                                            pabyDest2, nIters);
    5834             :     }
    5835             : #endif
    5836             : 
                           // Fallback implementation. i is the number of pixels already
                           // processed; it is shared by the two loops below.
    5837           2 :     size_t i = 0;
                           // The word-based loop accesses memory through unsigned int pointers,
                           // so it is only entered when the four addresses are all multiples of
                           // sizeof(unsigned int).
    5838           2 :     if (((reinterpret_cast<uintptr_t>(pabySrc) |
    5839           2 :           reinterpret_cast<uintptr_t>(pabyDest0) |
    5840           2 :           reinterpret_cast<uintptr_t>(pabyDest1) |
    5841           2 :           reinterpret_cast<uintptr_t>(pabyDest2)) %
    5842             :          sizeof(unsigned int)) == 0)
    5843             :     {
    5844             :         // Slightly better than GCC autovectorizer
    5845          17 :         for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
    5846             :         {
                                   // Each iteration handles 4 pixels: 12 source bytes read as
                                   // three words (bytes 0-3, 4-7 and 8-11 of the group). j is
                                   // the index of the output word, i.e. i / 4.
                                   // NOTE(review): the shifts and masks below pick the bytes
                                   // assuming a little-endian word layout, presumably implied
                                   // by the enclosing HAVE_SSE2 guard - confirm.
    5847          15 :             unsigned int word0 =
    5848          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
    5849          15 :             unsigned int word1 =
    5850          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
    5851          15 :             unsigned int word2 =
    5852          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
                                   // First component: source bytes 0, 3, 6 and 9 of the group,
                                   // assembled into one output word.
    5853          15 :             reinterpret_cast<unsigned int *>(pabyDest0)[j] =
    5854          15 :                 (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
    5855          15 :                 ((word2 >> 8) << 24);
                                   // Second component: source bytes 1, 4, 7 and 10.
    5856          15 :             reinterpret_cast<unsigned int *>(pabyDest1)[j] =
    5857          15 :                 ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
    5858          15 :                 (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
                                   // Third component: source bytes 2, 5, 8 and 11, stored one
                                   // byte at a time.
    5859          15 :             pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
    5860          15 :             pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
    5861          15 :             pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
    5862          15 :             pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
    5863             :         }
    5864             :     }
                           // Scalar loop: handles the pixels left over by the word-based loop,
                           // or all of them when that loop was not entered. Under clang, the
                           // pragma disables its vectorization.
    5865             : #if defined(__clang__)
    5866             : #pragma clang loop vectorize(disable)
    5867             : #endif
    5868           3 :     for (; i < nIters; ++i)
    5869             :     {
    5870           1 :         pabyDest0[i] = pabySrc[3 * i + 0];
    5871           1 :         pabyDest1[i] = pabySrc[3 * i + 1];
    5872           1 :         pabyDest2[i] = pabySrc[3 * i + 2];
    5873             :     }
    5874             : }
    5875             : #endif
    5876             : 
    5877             : /************************************************************************/
    5878             : /*                    GDALDeinterleave4Byte()                           */
    5879             : /************************************************************************/
    5880             : 
    5881             : #if !defined(__GNUC__) || defined(__clang__)
    5882             : 
    5883             : /************************************************************************/
    5884             : /*                         deinterleave()                               */
    5885             : /************************************************************************/
    5886             : 
                       // Extracts one byte from each 32-bit lane of four registers and returns
                       // the 16 resulting bytes in a single register, in the order xmm0_ori,
                       // xmm1_ori, xmm2_ori, xmm3_ori.
                       //
                       // SHIFT: when true, every 32-bit lane of the four registers is first
                       //        shifted right by 8 bits. The registers are taken by reference
                       //        and the shift is stored back into them, so the caller sees the
                       //        shifted values after the call.
                       // MASK:  when true, only the low 8 bits of each lane are kept before
                       //        packing (done on local copies, the inputs are not masked).
                       //        When false the lanes are packed as they are; as the two pack
                       //        instructions saturate, this is only lossless if each lane
                       //        already fits in 8 bits.
    5887             : template <bool SHIFT, bool MASK>
    5888             : inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
    5889             :                             __m128i &xmm2_ori, __m128i &xmm3_ori)
    5890             : {
    5891             :     // Set higher 24bit of each int32 packed word to 0
                           // NOTE(review): the zeroing described on the line above is what the
                           // MASK step further down does. The SHIFT step only moves the next
                           // byte of each lane into the low 8 bits, in place.
    5892             :     if (SHIFT)
    5893             :     {
    5894             :         xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
    5895             :         xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
    5896             :         xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
    5897             :         xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
    5898             :     }
                           // Working copies: masking and packing must not alter the caller's
                           // registers any further.
    5899             :     __m128i xmm0;
    5900             :     __m128i xmm1;
    5901             :     __m128i xmm2;
    5902             :     __m128i xmm3;
    5903             :     if (MASK)
    5904             :     {
                               // Keep the low byte of each 32-bit lane.
    5905             :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    5906             :         xmm0 = _mm_and_si128(xmm0_ori, xmm_mask);
    5907             :         xmm1 = _mm_and_si128(xmm1_ori, xmm_mask);
    5908             :         xmm2 = _mm_and_si128(xmm2_ori, xmm_mask);
    5909             :         xmm3 = _mm_and_si128(xmm3_ori, xmm_mask);
    5910             :     }
    5911             :     else
    5912             :     {
    5913             :         xmm0 = xmm0_ori;
    5914             :         xmm1 = xmm1_ori;
    5915             :         xmm2 = xmm2_ori;
    5916             :         xmm3 = xmm3_ori;
    5917             :     }
                           // Narrow the 4 x 4 lanes to 16 bytes in two steps. After the first
                           // step xmm0 holds the lanes of registers 0 and 1, and xmm2 those of
                           // registers 2 and 3.
    5918             :     // Pack int32 to int16
    5919             :     xmm0 = _mm_packs_epi32(xmm0, xmm1);
    5920             :     xmm2 = _mm_packs_epi32(xmm2, xmm3);
    5921             :     // Pack int16 to uint8
    5922             :     xmm0 = _mm_packus_epi16(xmm0, xmm2);
    5923             :     return xmm0;
    5924             : }
    5925             : 
    5926             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    5927             :                                   GByte *CPL_RESTRICT pabyDest0,
    5928             :                                   GByte *CPL_RESTRICT pabyDest1,
    5929             :                                   GByte *CPL_RESTRICT pabyDest2,
    5930             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
                       // Splits nIters pixels stored as interleaved groups of 4 bytes in pabySrc
                       // (pabySrc[4 * i + 0 .. 4 * i + 3]) into the four arrays pabyDest0 to
                       // pabyDest3, one byte per pixel in each. This is the variant used when
                       // the compiler is not GCC, or is clang. The function body is selected by
                       // the preprocessor just below.
    5931             : #ifdef USE_NEON_OPTIMIZATIONS
    5932             : {
                           // With USE_NEON_OPTIMIZATIONS, always forward to
                           // GDALDeinterleave4Byte_SSSE3() (defined elsewhere).
    5933             :     return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5934             :                                        pabyDest3, nIters);
    5935             : }
    5936             : #else
    5937             : {
    5938             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
                           // Forward to the SSSE3 implementation when CPLHaveRuntimeSSSE3()
                           // reports it can be used.
    5939             :     if (CPLHaveRuntimeSSSE3())
    5940             :     {
    5941             :         return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    5942             :                                            pabyDest2, pabyDest3, nIters);
    5943             :     }
    5944             : #endif
    5945             : 
    5946             :     // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
    5947             :     // do something slightly better.
                           // i is the number of pixels already processed; it is shared by the
                           // two loops below.
    5948             :     size_t i = 0;
                           // SSE2 loop: 16 pixels (64 source bytes, four 16-byte loads) per
                           // iteration, with unaligned loads and stores.
    5949             :     for (; i + 15 < nIters; i += 16)
    5950             :     {
    5951             :         __m128i xmm0_ori = _mm_loadu_si128(
    5952             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
    5953             :         __m128i xmm1_ori = _mm_loadu_si128(
    5954             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
    5955             :         __m128i xmm2_ori = _mm_loadu_si128(
    5956             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
    5957             :         __m128i xmm3_ori = _mm_loadu_si128(
    5958             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
    5959             : 
                               // The order of the four calls matters: deinterleave() takes the
                               // registers by reference, and each call with SHIFT = true shifts
                               // them right by 8 more bits in place. The first call (no shift)
                               // extracts the low byte of each 32-bit pixel, the following ones
                               // the next bytes in turn.
    5960             :         _mm_storeu_si128(
    5961             :             reinterpret_cast<__m128i *>(pabyDest0 + i),
    5962             :             deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5963             :         _mm_storeu_si128(
    5964             :             reinterpret_cast<__m128i *>(pabyDest1 + i),
    5965             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5966             :         _mm_storeu_si128(
    5967             :             reinterpret_cast<__m128i *>(pabyDest2 + i),
    5968             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
                               // No mask for the last component: after three shifts by 8 bits,
                               // only the former top byte is left in each lane.
    5969             :         _mm_storeu_si128(
    5970             :             reinterpret_cast<__m128i *>(pabyDest3 + i),
    5971             :             deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5972             :     }
    5973             : 
                           // Scalar loop for the remaining pixels (fewer than 16). Under clang,
                           // the pragma disables its vectorization.
    5974             : #if defined(__clang__)
    5975             : #pragma clang loop vectorize(disable)
    5976             : #endif
    5977             :     for (; i < nIters; ++i)
    5978             :     {
    5979             :         pabyDest0[i] = pabySrc[4 * i + 0];
    5980             :         pabyDest1[i] = pabySrc[4 * i + 1];
    5981             :         pabyDest2[i] = pabySrc[4 * i + 2];
    5982             :         pabyDest3[i] = pabySrc[4 * i + 3];
    5983             :     }
    5984             : }
    5985             : #endif
    5986             : #else
    5987             : // GCC autovectorizer does an excellent job on this plain loop, so the function is built with "tree-vectorize" enabled instead of using intrinsics.
    5988       62366 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
    5989             :     const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0, // pabySrc: 4 * nIters interleaved bytes
    5990             :     GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, // each pabyDestK receives nIters bytes
    5991             :     GByte *CPL_RESTRICT pabyDest3, size_t nIters) // nIters: number of 4-byte groups to split
    5992             : {
    5993   528825000 :     for (size_t i = 0; i < nIters; ++i) // one 4-byte group per iteration
    5994             :     {
    5995   528763000 :         pabyDest0[i] = pabySrc[4 * i + 0];
    5996   528763000 :         pabyDest1[i] = pabySrc[4 * i + 1];
    5997   528763000 :         pabyDest2[i] = pabySrc[4 * i + 2];
    5998   528763000 :         pabyDest3[i] = pabySrc[4 * i + 3];
    5999             :     }
    6000       62366 : }
    6001             : #endif
    6002             : 
    6003             : #else
    6004             : 
    6005             : /************************************************************************/
    6006             : /*                    GDALDeinterleave3Byte()                           */
    6007             : /************************************************************************/
    6008             : 
    6009             : // TODO: Enabling the attribute below could help on non-Intel architectures
    6010             : // where GCC knows how to auto-vectorize this loop
    6011             : // #if defined(__GNUC__)
    6012             : //__attribute__((optimize("tree-vectorize")))
    6013             : // #endif
    6014             : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc, // in: 3 * nIters interleaved bytes
    6015             :                                   GByte *CPL_RESTRICT pabyDest0, // out: source bytes at offsets 3 * i + 0
    6016             :                                   GByte *CPL_RESTRICT pabyDest1, // out: source bytes at offsets 3 * i + 1
    6017             :                                   GByte *CPL_RESTRICT pabyDest2, size_t nIters) // out: offsets 3 * i + 2; nIters = number of 3-byte groups
    6018             : {
    6019             :     for (size_t i = 0; i < nIters; ++i) // plain scalar loop: one 3-byte group per iteration
    6020             :     {
    6021             :         pabyDest0[i] = pabySrc[3 * i + 0];
    6022             :         pabyDest1[i] = pabySrc[3 * i + 1];
    6023             :         pabyDest2[i] = pabySrc[3 * i + 2];
    6024             :     }
    6025             : }
    6026             : 
    6027             : /************************************************************************/
    6028             : /*                    GDALDeinterleave4Byte()                           */
    6029             : /************************************************************************/
    6030             : 
    6031             : // TODO: Enabling the attribute below could help on non-Intel architectures
    6032             : // where GCC knows how to auto-vectorize this loop
    6033             : // #if defined(__GNUC__)
    6034             : //__attribute__((optimize("tree-vectorize")))
    6035             : // #endif
    6036             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc, // in: 4 * nIters interleaved bytes
    6037             :                                   GByte *CPL_RESTRICT pabyDest0, // out: source bytes at offsets 4 * i + 0
    6038             :                                   GByte *CPL_RESTRICT pabyDest1, // out: source bytes at offsets 4 * i + 1
    6039             :                                   GByte *CPL_RESTRICT pabyDest2, // out: source bytes at offsets 4 * i + 2
    6040             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters) // out: offsets 4 * i + 3; nIters = number of 4-byte groups
    6041             : {
    6042             :     for (size_t i = 0; i < nIters; ++i) // plain scalar loop: one 4-byte group per iteration
    6043             :     {
    6044             :         pabyDest0[i] = pabySrc[4 * i + 0];
    6045             :         pabyDest1[i] = pabySrc[4 * i + 1];
    6046             :         pabyDest2[i] = pabySrc[4 * i + 2];
    6047             :         pabyDest3[i] = pabySrc[4 * i + 3];
    6048             :     }
    6049             : }
    6050             : 
    6051             : #endif
    6052             : 
    6053             : /************************************************************************/
    6054             : /*                      GDALDeinterleave()                              */
    6055             : /************************************************************************/
    6056             : 
    6057             : /*! Copy values from a pixel-interleaved buffer to multiple per-component
    6058             :     buffers.
    6059             : 
    6060             :     In pseudo-code
    6061             :     \verbatim
    6062             :     for(size_t i = 0; i < nIters; ++i)
    6063             :         for(int iComp = 0; iComp < nComponents; iComp++ )
    6064             :             ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
    6065             :     \endverbatim
    6066             : 
    6067             :     The implementation is optimized for a few cases, like de-interleaving
    6068             :     of 3- or 4-component Byte buffers.
    6069             : 
    6070             :     \since GDAL 3.6
    6071             :  */
    6072      385065 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
    6073             :                       int nComponents, void **ppDestBuffer,
    6074             :                       GDALDataType eDestDT, size_t nIters)
    6075             : {
    6076      385065 :     if (eSourceDT == eDestDT) // the specialized paths below only apply when no type conversion is requested
    6077             :     {
    6078      385042 :         if (eSourceDT == GDT_Byte || eSourceDT == GDT_Int8) // both types are handled through GByte pointers
    6079             :         {
    6080      384721 :             if (nComponents == 3)
    6081             :             {
    6082      322321 :                 const GByte *CPL_RESTRICT pabySrc =
    6083             :                     static_cast<const GByte *>(pSourceBuffer);
    6084      322321 :                 GByte *CPL_RESTRICT pabyDest0 =
    6085             :                     static_cast<GByte *>(ppDestBuffer[0]);
    6086      322321 :                 GByte *CPL_RESTRICT pabyDest1 =
    6087             :                     static_cast<GByte *>(ppDestBuffer[1]);
    6088      322321 :                 GByte *CPL_RESTRICT pabyDest2 =
    6089             :                     static_cast<GByte *>(ppDestBuffer[2]);
    6090      322321 :                 GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6091             :                                       nIters);
    6092      322363 :                 return;
    6093             :             }
    6094       62400 :             else if (nComponents == 4)
    6095             :             {
    6096       62366 :                 const GByte *CPL_RESTRICT pabySrc =
    6097             :                     static_cast<const GByte *>(pSourceBuffer);
    6098       62366 :                 GByte *CPL_RESTRICT pabyDest0 =
    6099             :                     static_cast<GByte *>(ppDestBuffer[0]);
    6100       62366 :                 GByte *CPL_RESTRICT pabyDest1 =
    6101             :                     static_cast<GByte *>(ppDestBuffer[1]);
    6102       62366 :                 GByte *CPL_RESTRICT pabyDest2 =
    6103             :                     static_cast<GByte *>(ppDestBuffer[2]);
    6104       62366 :                 GByte *CPL_RESTRICT pabyDest3 =
    6105             :                     static_cast<GByte *>(ppDestBuffer[3]);
    6106       62366 :                 GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6107             :                                       pabyDest3, nIters);
    6108       62364 :                 return;
    6109          34 :             } // any other component count falls through to the generic path at the end
    6110             :         }
    6111             : #if ((defined(__GNUC__) && !defined(__clang__)) ||                             \
    6112             :      defined(__INTEL_CLANG_COMPILER)) &&                                       \
    6113             :     defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
    6114         642 :         else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
    6115         321 :                  CPLHaveRuntimeSSSE3()) // 16-bit fast paths are only taken when the runtime SSSE3 check passes
    6116             :         {
    6117         321 :             if (nComponents == 3)
    6118             :             {
    6119         126 :                 const GUInt16 *CPL_RESTRICT panSrc =
    6120             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    6121         126 :                 GUInt16 *CPL_RESTRICT panDest0 =
    6122             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    6123         126 :                 GUInt16 *CPL_RESTRICT panDest1 =
    6124             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    6125         126 :                 GUInt16 *CPL_RESTRICT panDest2 =
    6126             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    6127         126 :                 GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
    6128             :                                               panDest2, nIters);
    6129         126 :                 return;
    6130             :             }
    6131             : #if !defined(__INTEL_CLANG_COMPILER)
    6132             :             // ICC autovectorizer doesn't do a good job, at least with icx
    6133             :             // 2022.1.0.20220316
    6134         195 :             else if (nComponents == 4)
    6135             :             {
    6136         195 :                 const GUInt16 *CPL_RESTRICT panSrc =
    6137             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    6138         195 :                 GUInt16 *CPL_RESTRICT panDest0 =
    6139             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    6140         195 :                 GUInt16 *CPL_RESTRICT panDest1 =
    6141             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    6142         195 :                 GUInt16 *CPL_RESTRICT panDest2 =
    6143             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    6144         195 :                 GUInt16 *CPL_RESTRICT panDest3 =
    6145             :                     static_cast<GUInt16 *>(ppDestBuffer[3]);
    6146         195 :                 GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
    6147             :                                               panDest2, panDest3, nIters);
    6148         195 :                 return;
    6149             :             }
    6150             : #endif
    6151             :         }
    6152             : #endif
    6153             :     }
    6154             :     // Generic path, reached when no fast path returned above: one GDALCopyWords64() call per component.
    6155          57 :     const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
    6156          29 :     const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
    6157         108 :     for (int iComp = 0; iComp < nComponents; iComp++)
    6158             :     {
    6159          79 :         GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
    6160          79 :                             iComp * nSourceDTSize, // start at the first value of component iComp
    6161             :                         eSourceDT, nComponents * nSourceDTSize, // presumably the source stride: one full interleaved pixel (GDALCopyWords64 is declared elsewhere)
    6162          79 :                         ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters); // presumably a packed destination stride - confirm against GDALCopyWords64()
    6163             :     }
    6164             : }
    6165             : 
    6166             : /************************************************************************/
    6167             : /*                    GDALTranspose2DSingleToSingle()                   */
    6168             : /************************************************************************/
    6169             : /**
    6170             :  * Transpose a 2D array of non-complex values, in an efficient (cache-oblivious) way.
    6171             :  *
    6172             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6173             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6174             :  * @param nSrcWidth Width of pSrc array.
    6175             :  * @param nSrcHeight Height of pSrc array.
    6176             :  */
    6177             : 
    6178             : template <class DST, class SRC>
    6179         158 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
    6180             :                                    DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6181             :                                    size_t nSrcHeight)
    6182             : {
    6183         158 :     constexpr size_t blocksize = 32; // edge length, in elements, of the square tiles processed one at a time
    6184         341 :     for (size_t i = 0; i < nSrcHeight; i += blocksize) // i: first source row of the current tile
    6185             :     {
    6186         183 :         const size_t max_k = std::min(i + blocksize, nSrcHeight); // clip the tile at the last source row
    6187        4976 :         for (size_t j = 0; j < nSrcWidth; j += blocksize) // j: first source column of the current tile
    6188             :         {
    6189             :             // transpose the block beginning at [i,j]
    6190        4793 :             const size_t max_l = std::min(j + blocksize, nSrcWidth); // clip the tile at the last source column
    6191       25387 :             for (size_t k = i; k < max_k; ++k)
    6192             :             {
    6193      644522 :                 for (size_t l = j; l < max_l; ++l)
    6194             :                 {
    6195      623928 :                     GDALCopyWord(pSrc[l + k * nSrcWidth], // source element at (row k, column l)
    6196      623928 :                                  pDst[k + l * nSrcHeight]); // destination element at (row l, column k)
    6197             :                 }
    6198             :             }
    6199             :         }
    6200             :     }
    6201         158 : }
    6202             : 
    6203             : /************************************************************************/
    6204             : /*                   GDALTranspose2DComplexToComplex()                  */
    6205             : /************************************************************************/
    6206             : /**
    6207             :  * Transpose a 2D array of complex values into an array of complex values,
    6208             :  * in an efficient (cache-oblivious) way.
    6209             :  *
    6210             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6211             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6212             :  * @param nSrcWidth Width of pSrc array.
    6213             :  * @param nSrcHeight Height of pSrc array.
    6214             :  */
    6215             : template <class DST, class SRC>
    6216          25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
    6217             :                                      DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6218             :                                      size_t nSrcHeight)
    6219             : {
    6220          25 :     constexpr size_t blocksize = 32; // edge length, in complex elements, of the square tiles processed one at a time
    6221          50 :     for (size_t i = 0; i < nSrcHeight; i += blocksize) // i: first source row of the current tile
    6222             :     {
    6223          25 :         const size_t max_k = std::min(i + blocksize, nSrcHeight); // clip the tile at the last source row
    6224          50 :         for (size_t j = 0; j < nSrcWidth; j += blocksize) // j: first source column of the current tile
    6225             :         {
    6226             :             // transpose the block beginning at [i,j]
    6227          25 :             const size_t max_l = std::min(j + blocksize, nSrcWidth); // clip the tile at the last source column
    6228          75 :             for (size_t k = i; k < max_k; ++k)
    6229             :             {
    6230         200 :                 for (size_t l = j; l < max_l; ++l)
    6231             :                 {
    6232         150 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0], // first value of the pair (presumably the real part)
    6233         150 :                                  pDst[2 * (k + l * nSrcHeight) + 0]);
    6234         150 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1], // second value of the pair (presumably the imaginary part)
    6235         150 :                                  pDst[2 * (k + l * nSrcHeight) + 1]);
    6236             :                 }
    6237             :             }
    6238             :         }
    6239             :     }
    6240          25 : }
    6241             : 
    6242             : /************************************************************************/
    6243             : /*                   GDALTranspose2DComplexToSingle()                  */
    6244             : /************************************************************************/
    6245             : /**
    6246             :  * Transpose a 2D array of complex values into an array of non-complex values,
    6247             :  * in an efficient (cache-oblivious) way.
    6248             :  *
    6249             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6250             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6251             :  * @param nSrcWidth Width of pSrc array.
    6252             :  * @param nSrcHeight Height of pSrc array.
    6253             :  */
    6254             : template <class DST, class SRC>
    6255          55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
    6256             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6257             :                                     size_t nSrcHeight)
    6258             : {
    6259          55 :     constexpr size_t blocksize = 32; // edge length, in elements, of the square tiles processed one at a time
    6260         110 :     for (size_t i = 0; i < nSrcHeight; i += blocksize) // i: first source row of the current tile
    6261             :     {
    6262          55 :         const size_t max_k = std::min(i + blocksize, nSrcHeight); // clip the tile at the last source row
    6263         110 :         for (size_t j = 0; j < nSrcWidth; j += blocksize) // j: first source column of the current tile
    6264             :         {
    6265             :             // transpose the block beginning at [i,j]
    6266          55 :             const size_t max_l = std::min(j + blocksize, nSrcWidth); // clip the tile at the last source column
    6267         165 :             for (size_t k = i; k < max_k; ++k)
    6268             :             {
    6269         440 :                 for (size_t l = j; l < max_l; ++l)
    6270             :                 {
    6271         330 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0], // only the first value of each source pair is read; the second is ignored
    6272         330 :                                  pDst[k + l * nSrcHeight]);
    6273             :                 }
    6274             :             }
    6275             :         }
    6276             :     }
    6277          55 : }
    6278             : 
    6279             : /************************************************************************/
    6280             : /*                   GDALTranspose2DSingleToComplex()                  */
    6281             : /************************************************************************/
    6282             : /**
    6283             :  * Transpose a 2D array of non-complex values into an array of complex values,
    6284             :  * in an efficient (cache-oblivious) way.
    6285             :  *
    6286             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6287             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6288             :  * @param nSrcWidth Width of pSrc array.
    6289             :  * @param nSrcHeight Height of pSrc array.
    6290             :  */
    6291             : template <class DST, class SRC>
    6292          55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
    6293             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6294             :                                     size_t nSrcHeight)
    6295             : {
    6296          55 :     constexpr size_t blocksize = 32; // edge length, in elements, of the square tiles processed one at a time
    6297         110 :     for (size_t i = 0; i < nSrcHeight; i += blocksize) // i: first source row of the current tile
    6298             :     {
    6299          55 :         const size_t max_k = std::min(i + blocksize, nSrcHeight); // clip the tile at the last source row
    6300         110 :         for (size_t j = 0; j < nSrcWidth; j += blocksize) // j: first source column of the current tile
    6301             :         {
    6302             :             // transpose the block beginning at [i,j]
    6303          55 :             const size_t max_l = std::min(j + blocksize, nSrcWidth); // clip the tile at the last source column
    6304         165 :             for (size_t k = i; k < max_k; ++k)
    6305             :             {
    6306         440 :                 for (size_t l = j; l < max_l; ++l)
    6307             :                 {
    6308         330 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],
    6309         330 :                                  pDst[2 * (k + l * nSrcHeight) + 0]); // source value goes to the first slot of the destination pair
    6310         330 :                     pDst[2 * (k + l * nSrcHeight) + 1] = 0; // second slot of the pair is always written as zero
    6311             :                 }
    6312             :             }
    6313             :         }
    6314             :     }
    6315          55 : }
    6316             : 
    6317             : /************************************************************************/
    6318             : /*                        GDALTranspose2D()                             */
    6319             : /************************************************************************/
    6320             : 
    6321             : template <class DST, bool DST_IS_COMPLEX>
    6322         293 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
    6323             :                             size_t nSrcWidth, size_t nSrcHeight)
    6324             : { // Picks the transpose helper matching eSrcType and DST_IS_COMPLEX, then transposes pSrc into pDst.
    6325             : #define CALL_GDALTranspose2D_internal(SRC_TYPE)                                \
    6326             :     do                                                                         \
    6327             :     {                                                                          \
    6328             :         if constexpr (DST_IS_COMPLEX)                                          \
    6329             :         {                                                                      \
    6330             :             GDALTranspose2DSingleToComplex(                                    \
    6331             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6332             :                 nSrcHeight);                                                   \
    6333             :         }                                                                      \
    6334             :         else                                                                   \
    6335             :         {                                                                      \
    6336             :             GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
    6337             :                                           pDst, nSrcWidth, nSrcHeight);        \
    6338             :         }                                                                      \
    6339             :     } while (0)
    6340             :     // The macro above handles non-complex source types; the macro below handles complex source types.
    6341             : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE)                         \
    6342             :     do                                                                         \
    6343             :     {                                                                          \
    6344             :         if constexpr (DST_IS_COMPLEX)                                          \
    6345             :         {                                                                      \
    6346             :             GDALTranspose2DComplexToComplex(                                   \
    6347             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6348             :                 nSrcHeight);                                                   \
    6349             :         }                                                                      \
    6350             :         else                                                                   \
    6351             :         {                                                                      \
    6352             :             GDALTranspose2DComplexToSingle(                                    \
    6353             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6354             :                 nSrcHeight);                                                   \
    6355             :         }                                                                      \
    6356             :     } while (0)
    6357             : 
    6358             :     // clang-format off
    6359         293 :     switch (eSrcType) // pSrc is reinterpreted as the C++ scalar type paired with each GDAL data type
    6360             :     {
    6361          16 :         case GDT_Byte:     CALL_GDALTranspose2D_internal(uint8_t); break;
    6362          15 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t); break;
    6363          33 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t); break;
    6364          20 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t); break;
    6365          24 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t); break;
    6366          16 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t); break;
    6367          16 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t); break;
    6368          16 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t); break;
    6369          16 :         case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16); break;
    6370          17 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float); break;
    6371          24 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double); break;
    6372          16 :         case GDT_CInt16:   CALL_GDALTranspose2DComplex_internal(int16_t); break;
    6373          16 :         case GDT_CInt32:   CALL_GDALTranspose2DComplex_internal(int32_t); break;
    6374          16 :         case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
    6375          16 :         case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
    6376          16 :         case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
    6377           0 :         case GDT_Unknown: // these two values do nothing: pDst is left untouched
    6378             :         case GDT_TypeCount:
    6379           0 :             break;
    6380             :     }
    6381             :         // clang-format on
    6382             : 
    6383             : #undef CALL_GDALTranspose2D_internal
    6384             : #undef CALL_GDALTranspose2DComplex_internal
    6385         293 : }
    6386             : 
    6387             : /************************************************************************/
    6388             : /*                      GDALInterleave2Byte()                           */
    6389             : /************************************************************************/
    6390             : 
    6391             : #if defined(HAVE_SSE2) &&                                                      \
    6392             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    6393             : 
    6394             : // ICC autovectorizer doesn't do a good job at generating good SSE code,
    6395             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
    6396             : #if defined(__GNUC__)
    6397             : __attribute__((noinline))
    6398             : #endif
    6399             : static void
    6400             : GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc, // in: two planes of nIters bytes each, stored back to back
    6401             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters) // out: 2 * nIters bytes, alternating plane 0 / plane 1
    6402             : {
    6403             :     size_t i = 0; // index of the next byte pair to produce; shared with the scalar tail loop below
    6404             :     constexpr size_t VALS_PER_ITER = 16; // bytes taken from each plane per SIMD iteration (one 128-bit register)
    6405             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
    6406             :     {
    6407             :         __m128i xmm0 =
    6408             :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i)); // 16 bytes of plane 0
    6409             :         __m128i xmm1 = _mm_loadu_si128(
    6410             :             reinterpret_cast<__m128i const *>(pSrc + i + nIters)); // 16 bytes of plane 1
    6411             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
    6412             :                          _mm_unpacklo_epi8(xmm0, xmm1)); // interleaves the low 8 bytes of both registers
    6413             :         _mm_storeu_si128(
    6414             :             reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
    6415             :             _mm_unpackhi_epi8(xmm0, xmm1)); // interleaves the high 8 bytes of both registers
    6416             :     }
    6417             : #if defined(__clang__)
    6418             : #pragma clang loop vectorize(disable)
    6419             : #endif
    6420             :     for (; i < nIters; ++i) // scalar tail: pairs left over after the 16-wide loop
    6421             :     {
    6422             :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];
    6423             :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];
    6424             :     }
    6425             : }
    6426             : 
    6427             : #else
    6428             : 
    6429             : #if defined(__GNUC__) && !defined(__clang__) // GCC proper: request loop vectorization for this function
    6430             : __attribute__((optimize("tree-vectorize")))
    6431             : #endif
    6432             : #if defined(__GNUC__)
    6433             : __attribute__((noinline))
    6434             : #endif
    6435             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6436             : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
    6437             : #pragma clang diagnostic push
    6438             : #pragma clang diagnostic ignored "-Wpass-failed"
    6439             : #endif
    6440             : static void
    6441           9 : GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc, // in: two planes of nIters bytes each, stored back to back
    6442             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters) // out: 2 * nIters bytes, alternating plane 0 / plane 1
    6443             : {
    6444             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6445             : #pragma clang loop vectorize(enable)
    6446             : #endif
    6447      355429 :     for (size_t i = 0; i < nIters; ++i) // plain loop, left to the compiler's vectorizer
    6448             :     {
    6449      355420 :         pDst[2 * i + 0] = pSrc[i + 0 * nIters]; // byte i of plane 0
    6450      355420 :         pDst[2 * i + 1] = pSrc[i + 1 * nIters]; // byte i of plane 1
    6451             :     }
    6452           9 : }
    6453             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6454             : #pragma clang diagnostic pop // restores the diagnostic state saved before the function
    6455             : #endif
    6456             : 
    6457             : #endif
    6458             : 
    6459             : /************************************************************************/
    6460             : /*                      GDALInterleave4Byte()                           */
    6461             : /************************************************************************/
    6462             : 
    6463             : #if defined(HAVE_SSE2) &&                                                      \
    6464             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    6465             : 
    6466             : // ICC autovectorizer doesn't do a good job at generating good SSE code,
    6467             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
    6468             : #if defined(__GNUC__)
    6469             : __attribute__((noinline))
    6470             : #endif
    6471             : static void
    6472             : GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
    6473             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters)
    6474             : {
    6475             :     size_t i = 0;
    6476             :     constexpr size_t VALS_PER_ITER = 16;
    6477             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
    6478             :     {
    6479             :         __m128i xmm0 = _mm_loadu_si128(
    6480             :             reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
    6481             :         __m128i xmm1 = _mm_loadu_si128(
    6482             :             reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
    6483             :         __m128i xmm2 = _mm_loadu_si128(
    6484             :             reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
    6485             :         __m128i xmm3 = _mm_loadu_si128(
    6486             :             reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
    6487             :         auto tmp0 = _mm_unpacklo_epi8(
    6488             :             xmm0,
    6489             :             xmm1);  // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
    6490             :         auto tmp1 = _mm_unpackhi_epi8(
    6491             :             xmm0,
    6492             :             xmm1);  // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
    6493             :         auto tmp2 = _mm_unpacklo_epi8(
    6494             :             xmm2,
    6495             :             xmm3);  // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
    6496             :         auto tmp3 = _mm_unpackhi_epi8(
    6497             :             xmm2,
    6498             :             xmm3);  // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
    6499             :         auto tmp2_0 = _mm_unpacklo_epi16(
    6500             :             tmp0,
    6501             :             tmp2);  // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
    6502             :         auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
    6503             :         auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
    6504             :         auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
    6505             :         _mm_storeu_si128(
    6506             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
    6507             :             tmp2_0);
    6508             :         _mm_storeu_si128(
    6509             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
    6510             :             tmp2_1);
    6511             :         _mm_storeu_si128(
    6512             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
    6513             :             tmp2_2);
    6514             :         _mm_storeu_si128(
    6515             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
    6516             :             tmp2_3);
    6517             :     }
    6518             : #if defined(__clang__)
    6519             : #pragma clang loop vectorize(disable)
    6520             : #endif
    6521             :     for (; i < nIters; ++i)
    6522             :     {
    6523             :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];
    6524             :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];
    6525             :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];
    6526             :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];
    6527             :     }
    6528             : }
    6529             : 
    6530             : #else
    6531             : 
    6532             : #if defined(__GNUC__) && !defined(__clang__)
    6533             : __attribute__((optimize("tree-vectorize")))
    6534             : #endif
    6535             : #if defined(__GNUC__)
    6536             : __attribute__((noinline))
    6537             : #endif
    6538             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6539             : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
    6540             : #pragma clang diagnostic push
    6541             : #pragma clang diagnostic ignored "-Wpass-failed"
    6542             : #endif
    6543             : static void
    6544           9 : GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
    6545             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters)
    6546             : {
    6547             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6548             : #pragma clang loop vectorize(enable)
    6549             : #endif
    6550       75443 :     for (size_t i = 0; i < nIters; ++i)
    6551             :     {
    6552       75434 :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];
    6553       75434 :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];
    6554       75434 :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];
    6555       75434 :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];
    6556             :     }
    6557           9 : }
    6558             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6559             : #pragma clang diagnostic pop
    6560             : #endif
    6561             : 
    6562             : #endif
    6563             : 
    6564             : /************************************************************************/
    6565             : /*                        GDALTranspose2D()                             */
    6566             : /************************************************************************/
    6567             : 
    6568             : /**
    6569             :  * Transpose a 2D array in an efficient (cache-oblivious) way.
    6570             :  *
    6571             :  * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
    6572             :  * @param eSrcType Data type of pSrc.
    6573             :  * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
    6574             :  * @param eDstType Data type of pDst.
    6575             :  * @param nSrcWidth Width of pSrc array.
    6576             :  * @param nSrcHeight Height of pSrc array.
    6577             :  * @since GDAL 3.11
    6578             :  */
    6579             : 
    6580         344 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
    6581             :                      GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
    6582             : {
    6583         344 :     if (eSrcType == eDstType && (eSrcType == GDT_Byte || eSrcType == GDT_Int8))
    6584             :     {
    6585          51 :         if (nSrcHeight == 2)
    6586             :         {
    6587           9 :             GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
    6588             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    6589           9 :             return;
    6590             :         }
    6591          42 :         if (nSrcHeight == 4)
    6592             :         {
    6593           9 :             GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
    6594             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    6595           9 :             return;
    6596             :         }
    6597             : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                    \
    6598             :      (defined(__x86_64) || defined(_M_X64)))
    6599          33 :         if (CPLHaveRuntimeSSSE3())
    6600             :         {
    6601          33 :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    6602             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    6603             :                                        nSrcHeight);
    6604          33 :             return;
    6605             :         }
    6606             : #elif defined(USE_NEON_OPTIMIZATIONS)
    6607             :         {
    6608             :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    6609             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    6610             :                                        nSrcHeight);
    6611             :             return;
    6612             :         }
    6613             : #endif
    6614             :     }
    6615             : 
    6616             : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX)                \
    6617             :     GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>(                                 \
    6618             :         pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
    6619             : 
    6620             :     // clang-format off
    6621         293 :     switch (eDstType)
    6622             :     {
    6623          15 :         case GDT_Byte:     CALL_GDALTranspose2D_internal(uint8_t, false); break;
    6624          15 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t, false); break;
    6625          33 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t, false); break;
    6626          20 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t, false); break;
    6627          24 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t, false); break;
    6628          16 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t, false); break;
    6629          16 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t, false); break;
    6630          16 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t, false); break;
    6631          16 :         case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16, false); break;
    6632          17 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float, false); break;
    6633          25 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double, false); break;
    6634          16 :         case GDT_CInt16:   CALL_GDALTranspose2D_internal(int16_t, true); break;
    6635          16 :         case GDT_CInt32:   CALL_GDALTranspose2D_internal(int32_t, true); break;
    6636          16 :         case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
    6637          16 :         case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
    6638          16 :         case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
    6639           0 :         case GDT_Unknown:
    6640             :         case GDT_TypeCount:
    6641           0 :             break;
    6642             :     }
    6643             :         // clang-format on
    6644             : 
    6645             : #undef CALL_GDALTranspose2D_internal
    6646             : }
    6647             : 
    6648             : /************************************************************************/
    6649             : /*                     ExtractBitAndConvertTo255()                      */
    6650             : /************************************************************************/
    6651             : 
    6652             : #if defined(__GNUC__) || defined(_MSC_VER)
    6653             : // Signedness of char is implementation-defined, so be explicit.
    6654             : // Assumes two's-complement integer types and sign-extending right shifts.
    6655             : // GCC guarantees such:
    6656             : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
    6657      124890 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
    6658             : {
    6659      124890 :     return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
    6660      124890 :                               7);
    6661             : }
    6662             : #else
    6663             : // Portable way
    6664             : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
    6665             : {
    6666             :     return (byVal & (1 << nBit)) ? 255 : 0;
    6667             : }
    6668             : #endif
    6669             : 
    6670             : /************************************************************************/
    6671             : /*                   ExpandEightPackedBitsToByteAt255()                 */
    6672             : /************************************************************************/
    6673             : 
    6674       15569 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
    6675             :                                                     GByte abyOutput[8])
    6676             : {
    6677       15569 :     abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);
    6678       15569 :     abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
    6679       15569 :     abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
    6680       15569 :     abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
    6681       15569 :     abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
    6682       15569 :     abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
    6683       15569 :     abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
    6684       15569 :     abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);
    6685       15569 : }
    6686             : 
    6687             : /************************************************************************/
    6688             : /*                GDALExpandPackedBitsToByteAt0Or255()                  */
    6689             : /************************************************************************/
    6690             : 
    6691             : /** Expand packed bits (ordered from most-significant to least-significant
    6692             :   bit) into one byte each, where a bit at 0 is expanded to a byte at 0, and
    6693             :   a bit at 1 to a byte at 255.
    6694             : 
    6695             :  The function does (in a possibly more optimized way) the following:
    6696             :  \code{.cpp}
    6697             :  for (size_t i = 0; i < nInputBits; ++i )
    6698             :  {
    6699             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
    6700             :  }
    6701             :  \endcode
    6702             : 
    6703             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    6704             :  @param pabyOutput Output array of nInputBits bytes.
    6705             :  @param nInputBits Number of valid bits in pabyInput.
    6706             : 
    6707             :  @since GDAL 3.11
    6708             : */
    6709             : 
    6710       45145 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
    6711             :                                         GByte *CPL_RESTRICT pabyOutput,
    6712             :                                         size_t nInputBits)
    6713             : {
    6714       45145 :     const size_t nInputWholeBytes = nInputBits / 8;
    6715       45145 :     size_t iByte = 0;
    6716             : 
    6717             : #ifdef HAVE_SSE2
    6718             :     // Mask to isolate each bit
    6719       45145 :     const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
    6720             :                                           8, 16, 32, 64, -128);
    6721       45145 :     const __m128i zero = _mm_setzero_si128();
    6722       45145 :     const __m128i all_ones = _mm_set1_epi8(-1);
    6723             : #ifdef __SSSE3__
    6724             :     const __m128i dispatch_two_bytes =
    6725             :         _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
    6726             : #endif
    6727       45145 :     constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
    6728      135654 :     for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
    6729             :     {
    6730       90509 :         __m128i reg_ori = _mm_loadu_si128(
    6731       90509 :             reinterpret_cast<const __m128i *>(pabyInput + iByte));
    6732             : 
    6733       90509 :         constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
    6734      814581 :         for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
    6735             :         {
    6736             :             // Given reg_ori = (A, B, ... 14 other bytes ...),
    6737             :             // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
    6738             : #ifdef __SSSE3__
    6739             :             __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
    6740             : #else
    6741      724072 :             __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
    6742      724072 :             reg = _mm_unpacklo_epi16(reg, reg);
    6743      724072 :             reg = _mm_unpacklo_epi32(reg, reg);
    6744             : #endif
    6745             : 
    6746             :             // Isolate the bit of interest in each byte
    6747      724072 :             reg = _mm_and_si128(reg, bit_mask);
    6748             : 
    6749             :             // Now test if those bits are set, by comparing to zero. The
    6750             :             // result is that bytes whose bit is set will be at 0, and
    6751             :             // bytes whose bit is cleared will be at 0xFF, i.e. the inverse
    6752             :             // of the end result we want!
    6753      724072 :             reg = _mm_cmpeq_epi8(reg, zero);
    6754             : 
    6755             :             // Invert the result
    6756      724072 :             reg = _mm_andnot_si128(reg, all_ones);
    6757             : 
    6758             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
    6759             : 
    6760      724072 :             pabyOutput += SSE_REG_SIZE;
    6761             : 
    6762             :             // Right-shift of 2 bytes
    6763      724072 :             reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
    6764             :         }
    6765             :     }
    6766             : 
    6767             : #endif  // HAVE_SSE2
    6768             : 
    6769       60714 :     for (; iByte < nInputWholeBytes; ++iByte)
    6770             :     {
    6771       15569 :         ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
    6772       15569 :         pabyOutput += 8;
    6773             :     }
    6774       45483 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
    6775             :     {
    6776         338 :         *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
    6777         338 :         ++pabyOutput;
    6778             :     }
    6779       45145 : }
    6780             : 
    6781             : /************************************************************************/
    6782             : /*                   ExpandEightPackedBitsToByteAt1()                   */
    6783             : /************************************************************************/
    6784             : 
    6785      136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
    6786             :                                                   GByte abyOutput[8])
    6787             : {
    6788      136113 :     abyOutput[0] = (byVal >> 7) & 0x1;
    6789      136113 :     abyOutput[1] = (byVal >> 6) & 0x1;
    6790      136113 :     abyOutput[2] = (byVal >> 5) & 0x1;
    6791      136113 :     abyOutput[3] = (byVal >> 4) & 0x1;
    6792      136113 :     abyOutput[4] = (byVal >> 3) & 0x1;
    6793      136113 :     abyOutput[5] = (byVal >> 2) & 0x1;
    6794      136113 :     abyOutput[6] = (byVal >> 1) & 0x1;
    6795      136113 :     abyOutput[7] = (byVal >> 0) & 0x1;
    6796      136113 : }
    6797             : 
    6798             : /************************************************************************/
    6799             : /*                GDALExpandPackedBitsToByteAt0Or1()                    */
    6800             : /************************************************************************/
    6801             : 
    6802             : /** Expand packed bits (ordered from most-significant to least-significant
    6803             :   bit) into one byte each, where a bit at 0 is expanded to a byte at 0, and
    6804             :   a bit at 1 to a byte at 1.
    6805             : 
    6806             :  The function does (in a possibly more optimized way) the following:
    6807             :  \code{.cpp}
    6808             :  for (size_t i = 0; i < nInputBits; ++i )
    6809             :  {
    6810             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
    6811             :  }
    6812             :  \endcode
    6813             : 
    6814             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    6815             :  @param pabyOutput Output array of nInputBits bytes.
    6816             :  @param nInputBits Number of valid bits in pabyInput.
    6817             : 
    6818             :  @since GDAL 3.11
    6819             : */
    6820             : 
    6821        7041 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
    6822             :                                       GByte *CPL_RESTRICT pabyOutput,
    6823             :                                       size_t nInputBits)
    6824             : {
    6825        7041 :     const size_t nInputWholeBytes = nInputBits / 8;
    6826        7041 :     size_t iByte = 0;
    6827      143154 :     for (; iByte < nInputWholeBytes; ++iByte)
    6828             :     {
    6829      136113 :         ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
    6830      136113 :         pabyOutput += 8;
    6831             :     }
    6832       18902 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
    6833             :     {
    6834       11861 :         *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
    6835       11861 :         ++pabyOutput;
    6836             :     }
    6837        7041 : }

Generated by: LCOV version 1.14