LCOV - code coverage report
Current view: top level - gcore - rasterio.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2680 2903 92.3 %
Date: 2026-01-09 20:32:01 Functions: 687 728 94.4 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  Contains default implementation of GDALRasterBand::IRasterIO()
       5             :  *           and supporting functions of broader utility.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 1998, Frank Warmerdam
      10             :  * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal.h"
      17             : #include "gdal_priv.h"
      18             : 
      19             : #include <cassert>
      20             : #include <climits>
      21             : #include <cmath>
      22             : #include <cstddef>
      23             : #include <cstdio>
      24             : #include <cstdlib>
      25             : #include <cstring>
      26             : 
      27             : #include <algorithm>
      28             : #include <limits>
      29             : #include <stdexcept>
      30             : #include <type_traits>
      31             : 
      32             : #include "cpl_conv.h"
      33             : #include "cpl_cpu_features.h"
      34             : #include "cpl_error.h"
      35             : #include "cpl_float.h"
      36             : #include "cpl_progress.h"
      37             : #include "cpl_string.h"
      38             : #include "cpl_vsi.h"
      39             : #include "gdal_priv_templates.hpp"
      40             : #include "gdal_vrt.h"
      41             : #include "gdalwarper.h"
      42             : #include "memdataset.h"
      43             : #include "vrtdataset.h"
      44             : 
      45             : #if defined(__x86_64) || defined(_M_X64)
      46             : #include <emmintrin.h>
      47             : #define HAVE_SSE2
      48             : #elif defined(USE_NEON_OPTIMIZATIONS)
      49             : #include "include_sse2neon.h"
      50             : #define HAVE_SSE2
      51             : #endif
      52             : 
      53             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
      54             : #include "rasterio_ssse3.h"
      55             : #ifdef __SSSE3__
      56             : #include <tmmintrin.h>
      57             : #endif
      58             : #endif
      59             : 
      60             : #ifdef __SSE4_1__
      61             : #include <smmintrin.h>
      62             : #endif
      63             : 
      64             : #ifdef __GNUC__
      65             : #define CPL_NOINLINE __attribute__((noinline))
      66             : #else
      67             : #define CPL_NOINLINE
      68             : #endif
      69             : 
      70             : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
      71             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
      72             :                              int nDstPixelStride, GPtrDiff_t nWordCount);
      73             : 
      74             : /************************************************************************/
      75             : /*                    DownsamplingIntegerXFactor()                      */
      76             : /************************************************************************/
      77             : 
      78             : template <bool bSameDataType, int DATA_TYPE_SIZE>
      79      695780 : static bool DownsamplingIntegerXFactor(
      80             :     GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
      81             :     GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
      82             :     GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
      83             :     int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
      84             : {
      85      695780 :     const int nBandDataSize =
      86             :         bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
      87      695780 :     int nOuterLoopIters = nBufXSize - 1;
      88      695780 :     const int nIncSrcOffset = nSrcXInc * nBandDataSize;
      89             :     const GByte *CPL_RESTRICT pabySrcData;
      90      695780 :     int nEndBlockX = nBlockXSize + nStartBlockX;
      91             : 
      92      695780 :     if (iSrcX < nEndBlockX)
      93             :     {
      94      294999 :         CPLAssert(poBlock);
      95      294999 :         goto no_reload_block;
      96             :     }
      97      400781 :     goto reload_block;
      98             : 
      99             :     // Don't do the last iteration in the loop, as iSrcX might go beyond
     100             :     // nRasterXSize - 1
     101     1264973 :     while (--nOuterLoopIters >= 1)
     102             :     {
     103      201834 :         iSrcX += nSrcXInc;
     104      201834 :         pabySrcData += nIncSrcOffset;
     105      201834 :         pabyDstData += nPixelSpace;
     106             : 
     107             :         /* --------------------------------------------------------------------
     108             :          */
     109             :         /*      Ensure we have the appropriate block loaded. */
     110             :         /* --------------------------------------------------------------------
     111             :          */
     112      201834 :         if (iSrcX >= nEndBlockX)
     113             :         {
     114      201834 :         reload_block:
     115             :         {
     116      615205 :             const int nLBlockX = iSrcX / nBlockXSize;
     117      615205 :             nStartBlockX = nLBlockX * nBlockXSize;
     118      615205 :             nEndBlockX = nStartBlockX + nBlockXSize;
     119             : 
     120      615205 :             if (poBlock != nullptr)
     121      341376 :                 poBlock->DropLock();
     122             : 
     123      615205 :             poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     124      615205 :             if (poBlock == nullptr)
     125             :             {
     126           1 :                 return false;
     127             :             }
     128             :         }
     129             : 
     130      615204 :         no_reload_block:
     131             :             const GByte *pabySrcBlock =
     132     1264973 :                 static_cast<const GByte *>(poBlock->GetDataRef());
     133     1264973 :             GPtrDiff_t iSrcOffset =
     134     1264973 :                 (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
     135     1264973 :             pabySrcData = pabySrcBlock + iSrcOffset;
     136             :         }
     137             : 
     138             :         /* --------------------------------------------------------------------
     139             :          */
     140             :         /*      Copy the maximum run of pixels. */
     141             :         /* --------------------------------------------------------------------
     142             :          */
     143             : 
     144     1264973 :         const int nIters = std::min(
     145     1264973 :             (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
     146             :         if (bSameDataType)
     147             :         {
     148     1264530 :             memcpy(pabyDstData, pabySrcData, nBandDataSize);
     149     1264530 :             if (nIters > 1)
     150             :             {
     151             :                 if (DATA_TYPE_SIZE == 1)
     152             :                 {
     153      326250 :                     pabySrcData += nIncSrcOffset;
     154      326250 :                     pabyDstData += nPixelSpace;
     155      326250 :                     GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
     156      326250 :                                      nPixelSpace, nIters - 1);
     157      326250 :                     pabySrcData +=
     158      326250 :                         static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
     159      326250 :                     pabyDstData +=
     160      326250 :                         static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
     161             :                 }
     162             :                 else
     163             :                 {
     164     4395716 :                     for (int i = 0; i < nIters - 1; i++)
     165             :                     {
     166     4197550 :                         pabySrcData += nIncSrcOffset;
     167     4197550 :                         pabyDstData += nPixelSpace;
     168     4197550 :                         memcpy(pabyDstData, pabySrcData, nBandDataSize);
     169             :                     }
     170             :                 }
     171      524420 :                 iSrcX += nSrcXInc * (nIters - 1);
     172      524420 :                 nOuterLoopIters -= nIters - 1;
     173             :             }
     174             :         }
     175             :         else
     176             :         {
     177             :             // Type to type conversion ...
     178         443 :             GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
     179         443 :                             eBufType, nPixelSpace, std::max(1, nIters));
     180         443 :             if (nIters > 1)
     181             :             {
     182         216 :                 pabySrcData +=
     183         216 :                     static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
     184         216 :                 pabyDstData +=
     185         216 :                     static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
     186         216 :                 iSrcX += nSrcXInc * (nIters - 1);
     187         216 :                 nOuterLoopIters -= nIters - 1;
     188             :             }
     189             :         }
     190             :     }
     191             : 
     192             :     // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
     193     1063139 :     if (nOuterLoopIters == 0)
     194             :     {
     195      367360 :         const int nRasterXSize = poBand->GetXSize();
     196      367360 :         iSrcX =
     197      734720 :             static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
     198      367360 :                                       static_cast<GInt64>(nRasterXSize - 1)));
     199      367360 :         pabyDstData += nPixelSpace;
     200      367360 :         if (iSrcX < nEndBlockX)
     201             :         {
     202      354770 :             goto no_reload_block;
     203             :         }
     204       12590 :         goto reload_block;
     205             :     }
     206      695779 :     return true;
     207             : }
     208             : 
     209             : template <class A, class B>
     210     2732000 : CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(A a, B b)
     211             : {
     212     2732000 :     return a * b;
     213             : }
     214             : 
     215             : /************************************************************************/
     216             : /*                             IRasterIO()                              */
     217             : /*                                                                      */
     218             : /*      Default internal implementation of RasterIO() ... utilizes      */
     219             : /*      the Block access methods to satisfy the request.  This would    */
     220             : /*      normally only be overridden by formats with overviews.          */
     221             : /************************************************************************/
     222             : 
     223     6119250 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
     224             :                                  int nXSize, int nYSize, void *pData,
     225             :                                  int nBufXSize, int nBufYSize,
     226             :                                  GDALDataType eBufType, GSpacing nPixelSpace,
     227             :                                  GSpacing nLineSpace,
     228             :                                  GDALRasterIOExtraArg *psExtraArg)
     229             : 
     230             : {
     231     6119250 :     if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
     232             :     {
     233           0 :         CPLError(eFlushBlockErr, CPLE_AppDefined,
     234             :                  "An error occurred while writing a dirty block "
     235             :                  "from GDALRasterBand::IRasterIO");
     236           0 :         CPLErr eErr = eFlushBlockErr;
     237           0 :         eFlushBlockErr = CE_None;
     238           0 :         return eErr;
     239             :     }
     240     6119250 :     if (nBlockXSize <= 0 || nBlockYSize <= 0)
     241             :     {
     242           0 :         CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
     243           0 :         return CE_Failure;
     244             :     }
     245             : 
     246     6119250 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
     247     6119250 :     const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
     248     6119250 :     GByte dummyBlock[2] = {0, 0};
     249     6119250 :     GByte *pabySrcBlock =
     250             :         dummyBlock; /* to avoid Coverity warning about nullptr dereference */
     251     6119250 :     GDALRasterBlock *poBlock = nullptr;
     252     6119250 :     const bool bUseIntegerRequestCoords =
     253     6466460 :         (!psExtraArg->bFloatingPointWindowValidity ||
     254      347211 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
     255      323819 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
     256             : 
     257             :     /* ==================================================================== */
     258             :     /*      A common case is the data requested with the destination        */
     259             :     /*      is packed, and the block width is the raster width.             */
     260             :     /* ==================================================================== */
     261     6041750 :     if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
     262     3191810 :         nBlockXSize == GetXSize() && nBufXSize == nXSize &&
     263    12161000 :         nBufYSize == nYSize && bUseIntegerRequestCoords)
     264             :     {
     265     3079270 :         CPLErr eErr = CE_None;
     266     3079270 :         int nLBlockY = -1;
     267             : 
     268     9621000 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     269             :         {
     270     6542810 :             const int iSrcY = iBufYOff + nYOff;
     271             : 
     272     6542810 :             if (iSrcY < nLBlockY * nBlockYSize ||
     273     6542810 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
     274             :             {
     275     3339130 :                 nLBlockY = iSrcY / nBlockYSize;
     276     3339130 :                 bool bJustInitialize =
     277      295478 :                     eRWFlag == GF_Write && nXOff == 0 &&
     278     3691750 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     279       57137 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
     280             : 
     281             :                 // Is this a partial tile at right and/or bottom edges of
     282             :                 // the raster, and that is going to be completely written?
     283             :                 // If so, do not load it from storage, but zero it so that
     284             :                 // the content outsize of the validity area is initialized.
     285     3339130 :                 bool bMemZeroBuffer = false;
     286      295478 :                 if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
     287       23861 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     288     3634700 :                     nYOff + nYSize == GetYSize() &&
     289          89 :                     nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
     290             :                 {
     291          89 :                     bJustInitialize = true;
     292          89 :                     bMemZeroBuffer = true;
     293             :                 }
     294             : 
     295     3339130 :                 if (poBlock)
     296      259858 :                     poBlock->DropLock();
     297             : 
     298     3339130 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     299     3339130 :                 poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
     300     3339130 :                 if (poBlock == nullptr)
     301             :                 {
     302        1079 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     303             :                         nullptr)
     304             :                     {
     305           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     306             :                                  "GetBlockRef failed at X block offset %d, "
     307             :                                  "Y block offset %d%s",
     308             :                                  0, nLBlockY,
     309           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     310           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     311             :                                      : "");
     312             :                     }
     313        1079 :                     eErr = CE_Failure;
     314        1079 :                     break;
     315             :                 }
     316             : 
     317     3338050 :                 if (eRWFlag == GF_Write)
     318      295478 :                     poBlock->MarkDirty();
     319             : 
     320     3338050 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     321     3338050 :                 if (bMemZeroBuffer)
     322             :                 {
     323          89 :                     memset(pabySrcBlock, 0,
     324          89 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     325          89 :                                nBlockXSize * nBlockYSize);
     326             :                 }
     327             :             }
     328             : 
     329     6541730 :             const auto nSrcByteOffset =
     330     6541730 :                 (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
     331     6541730 :                      nBlockXSize +
     332     6541730 :                  nXOff) *
     333     6541730 :                 nBandDataSize;
     334             : 
     335     6541730 :             if (eDataType == eBufType)
     336             :             {
     337     2893900 :                 if (eRWFlag == GF_Read)
     338     2423290 :                     memcpy(static_cast<GByte *>(pData) +
     339     2423290 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     340     2423290 :                            pabySrcBlock + nSrcByteOffset,
     341             :                            static_cast<size_t>(nLineSpace));
     342             :                 else
     343      470615 :                     memcpy(pabySrcBlock + nSrcByteOffset,
     344      470615 :                            static_cast<GByte *>(pData) +
     345      470615 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     346             :                            static_cast<size_t>(nLineSpace));
     347             :             }
     348             :             else
     349             :             {
     350             :                 // Type to type conversion.
     351     3647820 :                 if (eRWFlag == GF_Read)
     352     3626220 :                     GDALCopyWords64(
     353     3626220 :                         pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
     354             :                         static_cast<GByte *>(pData) +
     355     3626220 :                             static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     356             :                         eBufType, static_cast<int>(nPixelSpace), nBufXSize);
     357             :                 else
     358       21603 :                     GDALCopyWords64(static_cast<GByte *>(pData) +
     359       21603 :                                         static_cast<GPtrDiff_t>(iBufYOff) *
     360             :                                             nLineSpace,
     361             :                                     eBufType, static_cast<int>(nPixelSpace),
     362       21603 :                                     pabySrcBlock + nSrcByteOffset, eDataType,
     363             :                                     nBandDataSize, nBufXSize);
     364             :             }
     365             : 
     366     6625330 :             if (psExtraArg->pfnProgress != nullptr &&
     367       83604 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     368             :                                          psExtraArg->pProgressData))
     369             :             {
     370           5 :                 eErr = CE_Failure;
     371           5 :                 break;
     372             :             }
     373             :         }
     374             : 
     375     3079270 :         if (poBlock)
     376     3078190 :             poBlock->DropLock();
     377             : 
     378     3079270 :         return eErr;
     379             :     }
     380             : 
     381             :     /* ==================================================================== */
     382             :     /*      Do we have overviews that would be appropriate to satisfy       */
     383             :     /*      this request?                                                   */
     384             :     /* ==================================================================== */
     385     3039970 :     if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
     386             :         eRWFlag == GF_Read)
     387             :     {
     388             :         GDALRasterIOExtraArg sExtraArg;
     389        2967 :         GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
     390             : 
     391             :         const int nOverview =
     392        2967 :             GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
     393             :                                           nBufXSize, nBufYSize, &sExtraArg);
     394        2967 :         if (nOverview >= 0)
     395             :         {
     396        2892 :             GDALRasterBand *poOverviewBand = GetOverview(nOverview);
     397        2892 :             if (poOverviewBand == nullptr)
     398        2892 :                 return CE_Failure;
     399             : 
     400        2892 :             return poOverviewBand->RasterIO(
     401             :                 eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
     402        2892 :                 nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
     403             :         }
     404             :     }
     405             : 
     406      848318 :     if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
     407           6 :         nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
     408     3885410 :         nLineSpace == nPixelSpace * nBufXSize &&
     409           6 :         CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
     410             :     {
     411           0 :         memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
     412           0 :         return CE_None;
     413             :     }
     414             : 
     415             :     /* ==================================================================== */
     416             :     /*      The second case when we don't need subsample data but likely    */
     417             :     /*      need data type conversion.                                      */
     418             :     /* ==================================================================== */
     419     3037080 :     if (  // nPixelSpace == nBufDataSize &&
     420     3037080 :         nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
     421             :     {
     422             : #if DEBUG_VERBOSE
     423             :         printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
     424             :                nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
     425             : #endif
     426             : 
     427             :         /* --------------------------------------------------------------------
     428             :          */
     429             :         /*      Loop over buffer computing source locations. */
     430             :         /* --------------------------------------------------------------------
     431             :          */
     432             :         // Calculate starting values out of loop
     433     2471450 :         const int nLBlockXStart = nXOff / nBlockXSize;
     434     2471450 :         const int nXSpanEnd = nBufXSize + nXOff;
     435             : 
     436     2471450 :         int nYInc = 0;
     437     4982650 :         for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
     438     2511200 :              iBufYOff += nYInc, iSrcY += nYInc)
     439             :         {
     440     2511280 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     441             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     442     2511280 :             int nLBlockY = iSrcY / nBlockYSize;
     443     2511280 :             int nLBlockX = nLBlockXStart;
     444     2511280 :             int iSrcX = nXOff;
     445     5243200 :             while (iSrcX < nXSpanEnd)
     446             :             {
     447     2732000 :                 int nXSpan = nLBlockX * nBlockXSize;
     448     2732000 :                 if (nXSpan < INT_MAX - nBlockXSize)
     449     2732000 :                     nXSpan += nBlockXSize;
     450             :                 else
     451           0 :                     nXSpan = INT_MAX;
     452     2732000 :                 const int nXRight = nXSpan;
     453     2732000 :                 nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
     454             : 
     455             :                 const size_t nXSpanSize =
     456     2732000 :                     CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
     457             : 
     458     2732000 :                 bool bJustInitialize =
     459     2042260 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
     460       37317 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
     461     4799900 :                     nXOff <= nLBlockX * nBlockXSize &&
     462       25639 :                     nXOff + nXSize >= nXRight;
     463             : 
     464             :                 // Is this a partial tile at right and/or bottom edges of
     465             :                 // the raster, and that is going to be completely written?
     466             :                 // If so, do not load it from storage, but zero it so that
     467             :                 // the content outsize of the validity area is initialized.
     468     2732000 :                 bool bMemZeroBuffer = false;
     469     2042260 :                 if (eRWFlag == GF_Write && !bJustInitialize &&
     470     2017850 :                     nXOff <= nLBlockX * nBlockXSize &&
     471     2016200 :                     nYOff <= nLBlockY * nBlockYSize &&
     472       12152 :                     (nXOff + nXSize >= nXRight ||
     473             :                      // cppcheck-suppress knownConditionTrueFalse
     474     4776970 :                      (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
     475       11972 :                     (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
     476       10750 :                      (nYOff + nYSize == GetYSize() &&
     477        1958 :                       nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
     478             :                 {
     479        3180 :                     bJustInitialize = true;
     480        3180 :                     bMemZeroBuffer = true;
     481             :                 }
     482             : 
     483             :                 /* --------------------------------------------------------------------
     484             :                  */
     485             :                 /*      Ensure we have the appropriate block loaded. */
     486             :                 /* --------------------------------------------------------------------
     487             :                  */
     488     2732000 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     489     2732000 :                 poBlock =
     490     2732000 :                     GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     491     2732000 :                 if (!poBlock)
     492             :                 {
     493          73 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     494             :                         nullptr)
     495             :                     {
     496           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     497             :                                  "GetBlockRef failed at X block offset %d, "
     498             :                                  "Y block offset %d%s",
     499             :                                  nLBlockX, nLBlockY,
     500           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     501           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     502             :                                      : "");
     503             :                     }
     504          73 :                     return (CE_Failure);
     505             :                 }
     506             : 
     507     2731930 :                 if (eRWFlag == GF_Write)
     508     2042260 :                     poBlock->MarkDirty();
     509             : 
     510     2731930 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     511     2731930 :                 if (bMemZeroBuffer)
     512             :                 {
     513        3180 :                     memset(pabySrcBlock, 0,
     514        3180 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     515        3180 :                                nBlockXSize * nBlockYSize);
     516             :                 }
     517             :                 /* --------------------------------------------------------------------
     518             :                  */
     519             :                 /*      Copy over this chunk of data. */
     520             :                 /* --------------------------------------------------------------------
     521             :                  */
     522     2731930 :                 GPtrDiff_t iSrcOffset =
     523     2731930 :                     (static_cast<GPtrDiff_t>(iSrcX) -
     524     2731930 :                      static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
     525     2731930 :                      (static_cast<GPtrDiff_t>(iSrcY) -
     526     2731930 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     527     2731930 :                          nBlockXSize) *
     528     2731930 :                     nBandDataSize;
     529             :                 // Fill up as many rows as possible for the loaded block.
     530     5463860 :                 const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
     531     2731930 :                                           nBufYSize - iBufYOff);
     532    59818000 :                 for (int k = 0; k < kmax; k++)
     533             :                 {
     534    57086100 :                     if (eDataType == eBufType && nPixelSpace == nBufDataSize)
     535             :                     {
     536    53132800 :                         if (eRWFlag == GF_Read)
     537    48694800 :                             memcpy(static_cast<GByte *>(pData) + iBufOffset +
     538    48694800 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     539    48694800 :                                    pabySrcBlock + iSrcOffset, nXSpanSize);
     540             :                         else
     541     4438030 :                             memcpy(pabySrcBlock + iSrcOffset,
     542     4438030 :                                    static_cast<GByte *>(pData) + iBufOffset +
     543     4438030 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     544             :                                    nXSpanSize);
     545             :                     }
     546             :                     else
     547             :                     {
     548             :                         /* type to type conversion */
     549     3953230 :                         if (eRWFlag == GF_Read)
     550     3896460 :                             GDALCopyWords64(
     551     3896460 :                                 pabySrcBlock + iSrcOffset, eDataType,
     552             :                                 nBandDataSize,
     553     3896460 :                                 static_cast<GByte *>(pData) + iBufOffset +
     554     3896460 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     555             :                                 eBufType, static_cast<int>(nPixelSpace),
     556             :                                 nXSpan);
     557             :                         else
     558       56776 :                             GDALCopyWords64(
     559       56776 :                                 static_cast<GByte *>(pData) + iBufOffset +
     560       56776 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     561             :                                 eBufType, static_cast<int>(nPixelSpace),
     562       56776 :                                 pabySrcBlock + iSrcOffset, eDataType,
     563             :                                 nBandDataSize, nXSpan);
     564             :                     }
     565             : 
     566    57086100 :                     iSrcOffset +=
     567    57086100 :                         static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
     568             :                 }
     569             : 
     570             :                 iBufOffset =
     571     2731930 :                     CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
     572     2731930 :                 nLBlockX++;
     573     2731930 :                 iSrcX += nXSpan;
     574             : 
     575     2731930 :                 poBlock->DropLock();
     576     2731930 :                 poBlock = nullptr;
     577             :             }
     578             : 
     579             :             /* Compute the increment to go on a block boundary */
     580     2511200 :             nYInc = nBlockYSize - (iSrcY % nBlockYSize);
     581             : 
     582     2513060 :             if (psExtraArg->pfnProgress != nullptr &&
     583        1856 :                 !psExtraArg->pfnProgress(
     584     2513060 :                     1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
     585             :                     psExtraArg->pProgressData))
     586             :             {
     587           0 :                 return CE_Failure;
     588             :             }
     589             :         }
     590             : 
     591     2471380 :         return CE_None;
     592             :     }
     593             : 
     594             :     /* ==================================================================== */
     595             :     /*      Loop reading required source blocks to satisfy output           */
     596             :     /*      request.  This is the most general implementation.              */
     597             :     /* ==================================================================== */
     598             : 
     599      565633 :     double dfXOff = nXOff;
     600      565633 :     double dfYOff = nYOff;
     601      565633 :     double dfXSize = nXSize;
     602      565633 :     double dfYSize = nYSize;
     603      565633 :     if (psExtraArg->bFloatingPointWindowValidity)
     604             :     {
     605      230638 :         dfXOff = psExtraArg->dfXOff;
     606      230638 :         dfYOff = psExtraArg->dfYOff;
     607      230638 :         dfXSize = psExtraArg->dfXSize;
     608      230638 :         dfYSize = psExtraArg->dfYSize;
     609             :     }
     610             : 
     611             :     /* -------------------------------------------------------------------- */
     612             :     /*      Compute stepping increment.                                     */
     613             :     /* -------------------------------------------------------------------- */
     614      565633 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
     615      565633 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
     616      565633 :     CPLErr eErr = CE_None;
     617             : 
     618      565633 :     if (eRWFlag == GF_Write)
     619             :     {
     620             :         /* --------------------------------------------------------------------
     621             :          */
     622             :         /*    Write case */
     623             :         /*    Loop over raster window computing source locations in the buffer.
     624             :          */
     625             :         /* --------------------------------------------------------------------
     626             :          */
     627      166655 :         GByte *pabyDstBlock = nullptr;
     628      166655 :         int nLBlockX = -1;
     629      166655 :         int nLBlockY = -1;
     630             : 
     631     1260010 :         for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
     632             :         {
     633     1093360 :             const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
     634             : 
     635    12384200 :             for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
     636             :             {
     637    11290800 :                 const int iBufXOff =
     638    11290800 :                     static_cast<int>((iDstX - nXOff) / dfSrcXInc);
     639    11290800 :                 GPtrDiff_t iBufOffset =
     640    11290800 :                     static_cast<GPtrDiff_t>(iBufYOff) *
     641             :                         static_cast<GPtrDiff_t>(nLineSpace) +
     642    11290800 :                     iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
     643             : 
     644             :                 // FIXME: this code likely doesn't work if the dirty block gets
     645             :                 // flushed to disk before being completely written.
     646             :                 // In the meantime, bJustInitialize should probably be set to
     647             :                 // FALSE even if it is not ideal performance wise, and for
     648             :                 // lossy compression.
     649             : 
     650             :                 /* --------------------------------------------------------------------
     651             :                  */
     652             :                 /*      Ensure we have the appropriate block loaded. */
     653             :                 /* --------------------------------------------------------------------
     654             :                  */
     655    11290800 :                 if (iDstX < nLBlockX * nBlockXSize ||
     656    11041500 :                     iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
     657    10584800 :                     iDstY < nLBlockY * nBlockYSize ||
     658    10584800 :                     iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
     659             :                 {
     660      738702 :                     nLBlockX = iDstX / nBlockXSize;
     661      738702 :                     nLBlockY = iDstY / nBlockYSize;
     662             : 
     663      738702 :                     const bool bJustInitialize =
     664     1065990 :                         nYOff <= nLBlockY * nBlockYSize &&
     665      327291 :                         nYOff + nYSize - nBlockYSize >=
     666      327291 :                             nLBlockY * nBlockYSize &&
     667     1116320 :                         nXOff <= nLBlockX * nBlockXSize &&
     668       50325 :                         nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
     669             :                     /*bool bMemZeroBuffer = FALSE;
     670             :                     if( !bJustInitialize &&
     671             :                         nXOff <= nLBlockX * nBlockXSize &&
     672             :                         nYOff <= nLBlockY * nBlockYSize &&
     673             :                         (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
     674             :                          (nXOff + nXSize == GetXSize() &&
     675             :                          (nLBlockX+1) * nBlockXSize > GetXSize())) &&
     676             :                         (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
     677             :                          (nYOff + nYSize == GetYSize() &&
     678             :                          (nLBlockY+1) * nBlockYSize > GetYSize())) )
     679             :                     {
     680             :                         bJustInitialize = TRUE;
     681             :                         bMemZeroBuffer = TRUE;
     682             :                     }*/
     683      738702 :                     if (poBlock != nullptr)
     684      572047 :                         poBlock->DropLock();
     685             : 
     686      738702 :                     poBlock =
     687      738702 :                         GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     688      738702 :                     if (poBlock == nullptr)
     689             :                     {
     690           0 :                         return (CE_Failure);
     691             :                     }
     692             : 
     693      738702 :                     poBlock->MarkDirty();
     694             : 
     695      738702 :                     pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
     696             :                     /*if( bMemZeroBuffer )
     697             :                     {
     698             :                         memset(pabyDstBlock, 0,
     699             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
     700             :                     * nBlockYSize);
     701             :                     }*/
     702             :                 }
     703             : 
     704             :                 // To make Coverity happy. Should not happen by design.
     705    11290800 :                 if (pabyDstBlock == nullptr)
     706             :                 {
     707           0 :                     CPLAssert(false);
     708             :                     eErr = CE_Failure;
     709             :                     break;
     710             :                 }
     711             : 
     712             :                 /* --------------------------------------------------------------------
     713             :                  */
     714             :                 /*      Copy over this pixel of data. */
     715             :                 /* --------------------------------------------------------------------
     716             :                  */
     717    11290800 :                 GPtrDiff_t iDstOffset =
     718    11290800 :                     (static_cast<GPtrDiff_t>(iDstX) -
     719    11290800 :                      static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
     720    11290800 :                      (static_cast<GPtrDiff_t>(iDstY) -
     721    11290800 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     722    11290800 :                          nBlockXSize) *
     723    11290800 :                     nBandDataSize;
     724             : 
     725    11290800 :                 if (eDataType == eBufType)
     726             :                 {
     727    11287700 :                     memcpy(pabyDstBlock + iDstOffset,
     728    11287700 :                            static_cast<GByte *>(pData) + iBufOffset,
     729             :                            nBandDataSize);
     730             :                 }
     731             :                 else
     732             :                 {
     733             :                     /* type to type conversion ... ouch, this is expensive way
     734             :                     of handling single words */
     735        3096 :                     GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
     736        3096 :                                     eBufType, 0, pabyDstBlock + iDstOffset,
     737             :                                     eDataType, 0, 1);
     738             :                 }
     739             :             }
     740             : 
     741     1093360 :             if (psExtraArg->pfnProgress != nullptr &&
     742           0 :                 !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
     743             :                                          psExtraArg->pProgressData))
     744             :             {
     745           0 :                 eErr = CE_Failure;
     746           0 :                 break;
     747             :             }
     748             :         }
     749             :     }
     750             :     else
     751             :     {
     752      398978 :         if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
     753             :         {
     754        9543 :             if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
     755        2719 :                  psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
     756        2681 :                  psExtraArg->eResampleAlg == GRIORA_Bilinear ||
     757        6865 :                  psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
     758        3191 :                 GetColorTable() != nullptr)
     759             :             {
     760           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     761             :                          "Resampling method not supported on paletted band. "
     762             :                          "Falling back to nearest neighbour");
     763             :             }
     764        3415 :             else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
     765           3 :                      GDALDataTypeIsComplex(eDataType))
     766             :             {
     767           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     768             :                          "Resampling method not supported on complex data type "
     769             :                          "band. Falling back to nearest neighbour");
     770             :             }
     771             :             else
     772             :             {
     773        3412 :                 return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
     774             :                                          pData, nBufXSize, nBufYSize, eBufType,
     775        3412 :                                          nPixelSpace, nLineSpace, psExtraArg);
     776             :             }
     777             :         }
     778             : 
     779      395566 :         int nLimitBlockY = 0;
     780      395566 :         const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
     781      395566 :         int nStartBlockX = -nBlockXSize;
     782      395566 :         constexpr double EPS = 1e-10;
     783      395566 :         int nLBlockY = -1;
     784      395566 :         const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
     785      395566 :         const bool bIntegerXFactor =
     786      372877 :             bUseIntegerRequestCoords &&
     787      669395 :             static_cast<int>(dfSrcXInc) == dfSrcXInc &&
     788      273829 :             static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
     789             : 
     790             :         /* --------------------------------------------------------------------
     791             :          */
     792             :         /*      Read case */
     793             :         /*      Loop over buffer computing source locations. */
     794             :         /* --------------------------------------------------------------------
     795             :          */
     796     2469430 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     797             :         {
     798             :             // Add small epsilon to avoid some numeric precision issues.
     799     2073880 :             const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
     800     2073880 :             const int iSrcY = static_cast<int>(std::min(
     801     2073880 :                 std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
     802             : 
     803     2073880 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     804             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     805             : 
     806     2073880 :             if (iSrcY >= nLimitBlockY)
     807             :             {
     808      437822 :                 nLBlockY = iSrcY / nBlockYSize;
     809      437822 :                 nLimitBlockY = nLBlockY * nBlockYSize;
     810      437822 :                 if (nLimitBlockY < INT_MAX - nBlockYSize)
     811      437822 :                     nLimitBlockY += nBlockYSize;
     812             :                 else
     813           0 :                     nLimitBlockY = INT_MAX;
     814             :                 // Make sure a new block is loaded.
     815      437822 :                 nStartBlockX = -nBlockXSize;
     816             :             }
     817     1636050 :             else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
     818             :             {
     819             :                 // Make sure a new block is loaded.
     820      441987 :                 nStartBlockX = -nBlockXSize;
     821             :             }
     822             : 
     823     2073880 :             GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
     824     2073880 :                                        static_cast<GPtrDiff_t>(nBlockXSize);
     825             : 
     826     2073880 :             if (bIntegerXFactor)
     827             :             {
     828      695780 :                 int iSrcX = static_cast<int>(dfSrcXStart);
     829      695780 :                 const int nSrcXInc = static_cast<int>(dfSrcXInc);
     830      695780 :                 GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
     831      695780 :                 bool bRet = false;
     832      695780 :                 if (bByteCopy)
     833             :                 {
     834      585773 :                     bRet = DownsamplingIntegerXFactor<true, 1>(
     835             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     836             :                         static_cast<int>(nPixelSpace), nBufXSize, GDT_UInt8,
     837             :                         GDT_UInt8, nStartBlockX, nBlockXSize, poBlock,
     838             :                         nLBlockY);
     839             :                 }
     840      110007 :                 else if (eDataType == eBufType)
     841             :                 {
     842      109782 :                     switch (nBandDataSize)
     843             :                     {
     844      109630 :                         case 2:
     845      109630 :                             bRet = DownsamplingIntegerXFactor<true, 2>(
     846             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     847             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     848             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     849             :                                 nBlockXSize, poBlock, nLBlockY);
     850      109630 :                             break;
     851          54 :                         case 4:
     852          54 :                             bRet = DownsamplingIntegerXFactor<true, 4>(
     853             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     854             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     855             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     856             :                                 nBlockXSize, poBlock, nLBlockY);
     857          54 :                             break;
     858          96 :                         case 8:
     859          96 :                             bRet = DownsamplingIntegerXFactor<true, 8>(
     860             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     861             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     862             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     863             :                                 nBlockXSize, poBlock, nLBlockY);
     864          96 :                             break;
     865           2 :                         case 16:
     866           2 :                             bRet = DownsamplingIntegerXFactor<true, 16>(
     867             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     868             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     869             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     870             :                                 nBlockXSize, poBlock, nLBlockY);
     871           2 :                             break;
     872           0 :                         default:
     873           0 :                             CPLAssert(false);
     874             :                             break;
     875             :                     }
     876             :                 }
     877             :                 else
     878             :                 {
     879         225 :                     bRet = DownsamplingIntegerXFactor<false, 0>(
     880             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     881             :                         static_cast<int>(nPixelSpace), nBufXSize, eDataType,
     882             :                         eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     883             :                 }
     884      695780 :                 if (!bRet)
     885           1 :                     eErr = CE_Failure;
     886             :             }
     887             :             else
     888             :             {
     889     1378100 :                 double dfSrcX = dfSrcXStart;
     890   598175000 :                 for (int iBufXOff = 0; iBufXOff < nBufXSize;
     891   596797000 :                      iBufXOff++, dfSrcX += dfSrcXInc)
     892             :                 {
     893             :                     // TODO?: try to avoid the clamping for most iterations
     894             :                     const int iSrcX = static_cast<int>(
     895  1193590000 :                         std::min(std::max(0.0, dfSrcX),
     896   596797000 :                                  static_cast<double>(nRasterXSize - 1)));
     897             : 
     898             :                     /* --------------------------------------------------------------------
     899             :                      */
     900             :                     /*      Ensure we have the appropriate block loaded. */
     901             :                     /* --------------------------------------------------------------------
     902             :                      */
     903   596797000 :                     if (iSrcX >= nBlockXSize + nStartBlockX)
     904             :                     {
     905     1706900 :                         const int nLBlockX = iSrcX / nBlockXSize;
     906     1706900 :                         nStartBlockX = nLBlockX * nBlockXSize;
     907             : 
     908     1706900 :                         if (poBlock != nullptr)
     909     1585160 :                             poBlock->DropLock();
     910             : 
     911     1706900 :                         poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     912     1706900 :                         if (poBlock == nullptr)
     913             :                         {
     914           9 :                             eErr = CE_Failure;
     915           9 :                             break;
     916             :                         }
     917             : 
     918             :                         pabySrcBlock =
     919     1706890 :                             static_cast<GByte *>(poBlock->GetDataRef());
     920             :                     }
     921   596797000 :                     const GPtrDiff_t nDiffX =
     922   596797000 :                         static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
     923             : 
     924             :                     /* --------------------------------------------------------------------
     925             :                      */
     926             :                     /*      Copy over this pixel of data. */
     927             :                     /* --------------------------------------------------------------------
     928             :                      */
     929             : 
     930   596797000 :                     if (bByteCopy)
     931             :                     {
     932   540998000 :                         GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
     933   540998000 :                         static_cast<GByte *>(pData)[iBufOffset] =
     934   540998000 :                             pabySrcBlock[iSrcOffset];
     935             :                     }
     936    55799000 :                     else if (eDataType == eBufType)
     937             :                     {
     938    50322800 :                         GPtrDiff_t iSrcOffset =
     939    50322800 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     940    50322800 :                         memcpy(static_cast<GByte *>(pData) + iBufOffset,
     941    50322800 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
     942             :                     }
     943             :                     else
     944             :                     {
     945             :                         // Type to type conversion ...
     946     5476160 :                         GPtrDiff_t iSrcOffset =
     947     5476160 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     948     5476160 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
     949             :                                         static_cast<GByte *>(pData) +
     950     5476160 :                                             iBufOffset,
     951             :                                         eBufType, 0, 1);
     952             :                     }
     953             : 
     954   596797000 :                     iBufOffset += static_cast<int>(nPixelSpace);
     955             :                 }
     956             :             }
     957     2073880 :             if (eErr == CE_Failure)
     958          11 :                 break;
     959             : 
     960     2315150 :             if (psExtraArg->pfnProgress != nullptr &&
     961      241284 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     962             :                                          psExtraArg->pProgressData))
     963             :             {
     964           1 :                 eErr = CE_Failure;
     965           1 :                 break;
     966             :             }
     967             :         }
     968             :     }
     969             : 
     970      562221 :     if (poBlock != nullptr)
     971      562211 :         poBlock->DropLock();
     972             : 
     973      562221 :     return eErr;
     974             : }
     975             : 
     976             : /************************************************************************/
     977             : /*                         GDALRasterIOTransformer()                    */
     978             : /************************************************************************/
     979             : 
     980             : struct GDALRasterIOTransformerStruct  // Parameters read by GDALRasterIOTransformer() through its pTransformerArg pointer.
     981             : {
     982             :     double dfXOff;  // X offset: added after scaling (dst -> src), subtracted before dividing (src -> dst).
     983             :     double dfYOff;  // Y offset: added after scaling (dst -> src), subtracted before dividing (src -> dst).
     984             :     double dfXRatioDstToSrc;  // X scale factor: src_x = dst_x * dfXRatioDstToSrc + dfXOff.
     985             :     double dfYRatioDstToSrc;  // Y scale factor: src_y = dst_y * dfYRatioDstToSrc + dfYOff.
     986             : };
     987             : 
     988        6748 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,  // pTransformerArg: cast below to GDALRasterIOTransformerStruct*; bDstToSrc: non-zero selects dst -> src
     989             :                                    int nPointCount, double *x, double *y,  // x / y: nPointCount coordinates each, overwritten in place
     990             :                                    double * /* z */, int *panSuccess)  // z is never read; panSuccess[i] is set to TRUE for every point
     991             : {
     992        6748 :     GDALRasterIOTransformerStruct *psParams =
     993             :         static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
     994        6748 :     if (bDstToSrc)  // Destination -> source: scale by the ratio, then add the offset
     995             :     {
     996      252996 :         for (int i = 0; i < nPointCount; i++)
     997             :         {
     998      246836 :             x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
     999      246836 :             y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
    1000      246836 :             panSuccess[i] = TRUE;
    1001             :         }
    1002             :     }
    1003             :     else  // Source -> destination: exact inverse (remove the offset, then divide by the ratio)
    1004             :     {
    1005        1176 :         for (int i = 0; i < nPointCount; i++)
    1006             :         {
    1007         588 :             x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;  // NOTE(review): no guard against a zero ratio here - confirm callers never pass one
    1008         588 :             y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
    1009         588 :             panSuccess[i] = TRUE;
    1010             :         }
    1011             :     }
    1012        6748 :     return TRUE;  // Always reports overall success; no failure path exists in this function
    1013             : }
    1014             : 
    1015             : /************************************************************************/
    1016             : /*                          RasterIOResampled()                         */
    1017             : /************************************************************************/
    1018             : 
    1019             : //! @cond Doxygen_Suppress
    1020        3412 : CPLErr GDALRasterBand::RasterIOResampled(
    1021             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1022             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1023             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    1024             : {
    1025             :     // Determine if we use warping resampling or overview resampling
    1026             :     const bool bUseWarp =
    1027        3412 :         (GDALDataTypeIsComplex(eDataType) &&
    1028        3571 :          psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    1029         159 :          psExtraArg->eResampleAlg != GRIORA_Mode);
    1030             : 
    1031        3412 :     double dfXOff = nXOff;
    1032        3412 :     double dfYOff = nYOff;
    1033        3412 :     double dfXSize = nXSize;
    1034        3412 :     double dfYSize = nYSize;
    1035        3412 :     if (psExtraArg->bFloatingPointWindowValidity)
    1036             :     {
    1037        2717 :         dfXOff = psExtraArg->dfXOff;
    1038        2717 :         dfYOff = psExtraArg->dfYOff;
    1039        2717 :         dfXSize = psExtraArg->dfXSize;
    1040        2717 :         dfYSize = psExtraArg->dfYSize;
    1041             :     }
    1042             : 
    1043        3412 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1044        3412 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1045             : 
    1046             :     // Determine the coordinates in the "virtual" output raster to see
    1047             :     // if there are not integers, in which case we will use them as a shift
    1048             :     // so that subwindow extracts give the exact same results as entire raster
    1049             :     // scaling.
    1050        3412 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1051        3412 :     bool bHasXOffVirtual = false;
    1052        3412 :     int nDestXOffVirtual = 0;
    1053        3412 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1054             :     {
    1055        3084 :         bHasXOffVirtual = true;
    1056        3084 :         dfXOff = nXOff;
    1057        3084 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1058             :     }
    1059             : 
    1060        3412 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1061        3412 :     bool bHasYOffVirtual = false;
    1062        3412 :     int nDestYOffVirtual = 0;
    1063        3412 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1064             :     {
    1065        3080 :         bHasYOffVirtual = true;
    1066        3080 :         dfYOff = nYOff;
    1067        3080 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1068             :     }
    1069             : 
    1070             :     // Create a MEM dataset that wraps the output buffer.
    1071             :     GDALDataset *poMEMDS;
    1072        3412 :     void *pTempBuffer = nullptr;
    1073        3412 :     GSpacing nPSMem = nPixelSpace;
    1074        3412 :     GSpacing nLSMem = nLineSpace;
    1075        3412 :     void *pDataMem = pData;
    1076        3412 :     GDALDataType eDTMem = eBufType;
    1077        3412 :     if (eBufType != eDataType)
    1078             :     {
    1079          44 :         nPSMem = GDALGetDataTypeSizeBytes(eDataType);
    1080          44 :         nLSMem = nPSMem * nBufXSize;
    1081             :         pTempBuffer =
    1082          44 :             VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
    1083          44 :         if (pTempBuffer == nullptr)
    1084           0 :             return CE_Failure;
    1085          44 :         pDataMem = pTempBuffer;
    1086          44 :         eDTMem = eDataType;
    1087             :     }
    1088             : 
    1089             :     poMEMDS =
    1090        3412 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1091             :                            nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
    1092        3412 :     GByte *pabyData = static_cast<GByte *>(pDataMem) -
    1093        3412 :                       nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
    1094        3412 :     GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
    1095             :         poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
    1096        3412 :     poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
    1097             : 
    1098        3412 :     const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1099        3412 :     const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    1100        3412 :     if (pszNBITS)
    1101           6 :         GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
    1102           6 :             "NBITS", pszNBITS, "IMAGE_STRUCTURE");
    1103             : 
    1104        3412 :     CPLErr eErr = CE_None;
    1105             : 
    1106             :     // Do the resampling.
    1107        3412 :     if (bUseWarp)
    1108             :     {
    1109         149 :         int bHasNoData = FALSE;
    1110         149 :         double dfNoDataValue = GetNoDataValue(&bHasNoData);
    1111             : 
    1112         149 :         VRTDatasetH hVRTDS = nullptr;
    1113         149 :         GDALRasterBandH hVRTBand = nullptr;
    1114         149 :         if (GetDataset() == nullptr)
    1115             :         {
    1116             :             /* Create VRT dataset that wraps the whole dataset */
    1117           0 :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1118           0 :             VRTAddBand(hVRTDS, eDataType, nullptr);
    1119           0 :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1120           0 :             VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
    1121             :                                0, 0, nRasterXSize, nRasterYSize, nullptr,
    1122             :                                VRT_NODATA_UNSET);
    1123             : 
    1124             :             /* Add a mask band if needed */
    1125           0 :             if (GetMaskFlags() != GMF_ALL_VALID)
    1126             :             {
    1127           0 :                 GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
    1128             :                 VRTSourcedRasterBand *poVRTMaskBand =
    1129             :                     reinterpret_cast<VRTSourcedRasterBand *>(
    1130             :                         reinterpret_cast<GDALRasterBand *>(hVRTBand)
    1131           0 :                             ->GetMaskBand());
    1132           0 :                 poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
    1133           0 :                                                  nRasterYSize, 0, 0,
    1134           0 :                                                  nRasterXSize, nRasterYSize);
    1135             :             }
    1136             :         }
    1137             : 
    1138         149 :         GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
    1139         149 :         switch (psExtraArg->eResampleAlg)
    1140             :         {
    1141           0 :             case GRIORA_NearestNeighbour:
    1142           0 :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1143           0 :                 break;
    1144         147 :             case GRIORA_Bilinear:
    1145         147 :                 psWarpOptions->eResampleAlg = GRA_Bilinear;
    1146         147 :                 break;
    1147           0 :             case GRIORA_Cubic:
    1148           0 :                 psWarpOptions->eResampleAlg = GRA_Cubic;
    1149           0 :                 break;
    1150           0 :             case GRIORA_CubicSpline:
    1151           0 :                 psWarpOptions->eResampleAlg = GRA_CubicSpline;
    1152           0 :                 break;
    1153           0 :             case GRIORA_Lanczos:
    1154           0 :                 psWarpOptions->eResampleAlg = GRA_Lanczos;
    1155           0 :                 break;
    1156           0 :             case GRIORA_Average:
    1157           0 :                 psWarpOptions->eResampleAlg = GRA_Average;
    1158           0 :                 break;
    1159           2 :             case GRIORA_RMS:
    1160           2 :                 psWarpOptions->eResampleAlg = GRA_RMS;
    1161           2 :                 break;
    1162           0 :             case GRIORA_Mode:
    1163           0 :                 psWarpOptions->eResampleAlg = GRA_Mode;
    1164           0 :                 break;
    1165           0 :             default:
    1166           0 :                 CPLAssert(false);
    1167             :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1168             :                 break;
    1169             :         }
    1170         149 :         psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
    1171         149 :         psWarpOptions->hDstDS = poMEMDS;
    1172         149 :         psWarpOptions->nBandCount = 1;
    1173         149 :         int nSrcBandNumber = hVRTDS ? 1 : nBand;
    1174         149 :         int nDstBandNumber = 1;
    1175         149 :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1176         149 :         psWarpOptions->panDstBands = &nDstBandNumber;
    1177         298 :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress
    1178         149 :                                          ? psExtraArg->pfnProgress
    1179             :                                          : GDALDummyProgress;
    1180         149 :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1181         149 :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1182         149 :         if (bHasNoData)
    1183             :         {
    1184           0 :             psWarpOptions->papszWarpOptions = CSLSetNameValue(
    1185             :                 psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
    1186           0 :             if (psWarpOptions->padfSrcNoDataReal == nullptr)
    1187             :             {
    1188           0 :                 psWarpOptions->padfSrcNoDataReal =
    1189           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1190           0 :                 psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
    1191             :             }
    1192             : 
    1193           0 :             if (psWarpOptions->padfDstNoDataReal == nullptr)
    1194             :             {
    1195           0 :                 psWarpOptions->padfDstNoDataReal =
    1196           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1197           0 :                 psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
    1198             :             }
    1199             :         }
    1200             : 
    1201             :         GDALRasterIOTransformerStruct sTransformer;
    1202         149 :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1203         149 :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1204         149 :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1205         149 :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1206         149 :         psWarpOptions->pTransformerArg = &sTransformer;
    1207             : 
    1208             :         GDALWarpOperationH hWarpOperation =
    1209         149 :             GDALCreateWarpOperation(psWarpOptions);
    1210         149 :         eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
    1211             :                                      nDestYOffVirtual, nBufXSize, nBufYSize);
    1212         149 :         GDALDestroyWarpOperation(hWarpOperation);
    1213             : 
    1214         149 :         psWarpOptions->panSrcBands = nullptr;
    1215         149 :         psWarpOptions->panDstBands = nullptr;
    1216         149 :         GDALDestroyWarpOptions(psWarpOptions);
    1217             : 
    1218         149 :         if (hVRTDS)
    1219           0 :             GDALClose(hVRTDS);
    1220             :     }
    1221             :     else
    1222             :     {
    1223        3263 :         const char *pszResampling =
    1224        4254 :             (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
    1225        1289 :             : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
    1226         558 :             : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
    1227         479 :             : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
    1228         342 :             : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
    1229         199 :             : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
    1230          79 :             : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
    1231           3 :             : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
    1232             :                                                                : "UNKNOWN";
    1233             : 
    1234        3263 :         int nKernelRadius = 0;
    1235             :         GDALResampleFunction pfnResampleFunc =
    1236        3263 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1237        3263 :         CPLAssert(pfnResampleFunc);
    1238             :         GDALDataType eWrkDataType =
    1239        3263 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1240        3263 :         int nHasNoData = 0;
    1241        3263 :         double dfNoDataValue = GetNoDataValue(&nHasNoData);
    1242        3263 :         const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    1243        3263 :         if (!bHasNoData)
    1244        3173 :             dfNoDataValue = 0.0;
    1245             : 
    1246        3263 :         int nDstBlockXSize = nBufXSize;
    1247        3263 :         int nDstBlockYSize = nBufYSize;
    1248        3263 :         int nFullResXChunk = 0;
    1249        3263 :         int nFullResYChunk = 0;
    1250             :         while (true)
    1251             :         {
    1252        3274 :             nFullResXChunk =
    1253        3274 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1254        3274 :             nFullResYChunk =
    1255        3274 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1256        3274 :             if (nFullResXChunk > nRasterXSize)
    1257        2911 :                 nFullResXChunk = nRasterXSize;
    1258        3274 :             if (nFullResYChunk > nRasterYSize)
    1259         512 :                 nFullResYChunk = nRasterYSize;
    1260        3274 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1261        3216 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1262             :                  1024 * 1024))
    1263             :                 break;
    1264             :             // When operating on the full width of a raster whose block width is
    1265             :             // the raster width, prefer doing chunks in height.
    1266          11 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1267             :                 nDstBlockYSize > 1)
    1268           0 :                 nDstBlockYSize /= 2;
    1269             :             /* Otherwise cut the maximal dimension */
    1270          11 :             else if (nDstBlockXSize > 1 &&
    1271           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1272          11 :                 nDstBlockXSize /= 2;
    1273             :             else
    1274           0 :                 nDstBlockYSize /= 2;
    1275             :         }
    1276             : 
    1277        3263 :         int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
    1278        3263 :         int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    1279        3263 :         if (nOvrXFactor == 0)
    1280        2029 :             nOvrXFactor = 1;
    1281        3263 :         if (nOvrYFactor == 0)
    1282        2028 :             nOvrYFactor = 1;
    1283        3263 :         int nFullResXSizeQueried =
    1284        3263 :             nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
    1285        3263 :         int nFullResYSizeQueried =
    1286        3263 :             nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
    1287             : 
    1288        3263 :         if (nFullResXSizeQueried > nRasterXSize)
    1289        2701 :             nFullResXSizeQueried = nRasterXSize;
    1290        3263 :         if (nFullResYSizeQueried > nRasterYSize)
    1291         299 :             nFullResYSizeQueried = nRasterYSize;
    1292             : 
    1293             :         void *pChunk =
    1294        3263 :             VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    1295             :                                 nFullResXSizeQueried, nFullResYSizeQueried);
    1296        3263 :         GByte *pabyChunkNoDataMask = nullptr;
    1297             : 
    1298        3263 :         GDALRasterBand *poMaskBand = GetMaskBand();
    1299        3263 :         int l_nMaskFlags = GetMaskFlags();
    1300             : 
    1301        3263 :         bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
    1302        3263 :         if (bUseNoDataMask)
    1303             :         {
    1304         158 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1305             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1306             :         }
    1307        3263 :         if (pChunk == nullptr ||
    1308         158 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1309             :         {
    1310           0 :             GDALClose(poMEMDS);
    1311           0 :             CPLFree(pChunk);
    1312           0 :             CPLFree(pabyChunkNoDataMask);
    1313           0 :             VSIFree(pTempBuffer);
    1314           0 :             return CE_Failure;
    1315             :         }
    1316             : 
    1317        3263 :         const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
    1318        3263 :                                  DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
    1319        3263 :         int nBlocksDone = 0;
    1320             : 
    1321             :         int nDstYOff;
    1322        6526 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1323        3263 :              nDstYOff += nDstBlockYSize)
    1324             :         {
    1325             :             int nDstYCount;
    1326        3263 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1327        3263 :                 nDstYCount = nDstBlockYSize;
    1328             :             else
    1329           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1330             : 
    1331        3263 :             int nChunkYOff =
    1332        3263 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1333        3263 :             int nChunkYOff2 = nYOff + 1 +
    1334        3263 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1335             :                                                     dfYRatioDstToSrc));
    1336        3263 :             if (nChunkYOff2 > nRasterYSize)
    1337         660 :                 nChunkYOff2 = nRasterYSize;
    1338        3263 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1339        3263 :             CPLAssert(nYCount <= nFullResYChunk);
    1340             : 
    1341        3263 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
    1342        3263 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
    1343        3263 :             if (nChunkYOffQueried < 0)
    1344             :             {
    1345         458 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1346         458 :                 nChunkYOffQueried = 0;
    1347             :             }
    1348        3263 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1349         561 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1350        3263 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1351             : 
    1352        3263 :             int nDstXOff = 0;
    1353        6526 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1354        3263 :                  nDstXOff += nDstBlockXSize)
    1355             :             {
    1356        3263 :                 int nDstXCount = 0;
    1357        3263 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1358        3263 :                     nDstXCount = nDstBlockXSize;
    1359             :                 else
    1360           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1361             : 
    1362        3263 :                 int nChunkXOff =
    1363        3263 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1364        3263 :                 int nChunkXOff2 =
    1365        3263 :                     nXOff + 1 +
    1366        3263 :                     static_cast<int>(
    1367        3263 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1368        3263 :                 if (nChunkXOff2 > nRasterXSize)
    1369        2960 :                     nChunkXOff2 = nRasterXSize;
    1370        3263 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1371        3263 :                 CPLAssert(nXCount <= nFullResXChunk);
    1372             : 
    1373        3263 :                 int nChunkXOffQueried =
    1374        3263 :                     nChunkXOff - nKernelRadius * nOvrXFactor;
    1375        3263 :                 int nChunkXSizeQueried =
    1376        3263 :                     nXCount + 2 * nKernelRadius * nOvrXFactor;
    1377        3263 :                 if (nChunkXOffQueried < 0)
    1378             :                 {
    1379        2762 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1380        2762 :                     nChunkXOffQueried = 0;
    1381             :                 }
    1382        3263 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1383        2748 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1384        3263 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1385             : 
    1386             :                 // Read the source buffers.
    1387        3263 :                 eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1388             :                                 nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1389             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    1390             :                                 eWrkDataType, 0, 0, nullptr);
    1391             : 
    1392        3263 :                 bool bSkipResample = false;
    1393        3263 :                 bool bNoDataMaskFullyOpaque = false;
    1394        3263 :                 if (eErr == CE_None && bUseNoDataMask)
    1395             :                 {
    1396         158 :                     eErr = poMaskBand->RasterIO(
    1397             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1398             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1399             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1400             :                         nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
    1401             : 
    1402             :                     /* Optimizations if mask if fully opaque or transparent */
    1403         158 :                     int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1404         158 :                     GByte bVal = pabyChunkNoDataMask[0];
    1405         158 :                     int i = 1;
    1406     3751650 :                     for (; i < nPixels; i++)
    1407             :                     {
    1408     3751590 :                         if (pabyChunkNoDataMask[i] != bVal)
    1409         104 :                             break;
    1410             :                     }
    1411         158 :                     if (i == nPixels)
    1412             :                     {
    1413          54 :                         if (bVal == 0)
    1414             :                         {
    1415         712 :                             for (int j = 0; j < nDstYCount; j++)
    1416             :                             {
    1417         686 :                                 GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
    1418             :                                                 static_cast<GByte *>(pDataMem) +
    1419         686 :                                                     nLSMem * (j + nDstYOff) +
    1420         686 :                                                     nDstXOff * nPSMem,
    1421             :                                                 eDTMem,
    1422             :                                                 static_cast<int>(nPSMem),
    1423             :                                                 nDstXCount);
    1424             :                             }
    1425          26 :                             bSkipResample = true;
    1426             :                         }
    1427             :                         else
    1428             :                         {
    1429          28 :                             bNoDataMaskFullyOpaque = true;
    1430             :                         }
    1431             :                     }
    1432             :                 }
    1433             : 
    1434        3263 :                 if (!bSkipResample && eErr == CE_None)
    1435             :                 {
    1436        3234 :                     const bool bPropagateNoData = false;
    1437        3234 :                     void *pDstBuffer = nullptr;
    1438        3234 :                     GDALDataType eDstBufferDataType = GDT_Unknown;
    1439             :                     GDALRasterBand *poMEMBand =
    1440        3234 :                         GDALRasterBand::FromHandle(hMEMBand);
    1441        3234 :                     GDALOverviewResampleArgs args;
    1442        3234 :                     args.eSrcDataType = eDataType;
    1443        3234 :                     args.eOvrDataType = poMEMBand->GetRasterDataType();
    1444        3234 :                     args.nOvrXSize = poMEMBand->GetXSize();
    1445        3234 :                     args.nOvrYSize = poMEMBand->GetYSize();
    1446        3234 :                     args.nOvrNBITS = nNBITS;
    1447        3234 :                     args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1448        3234 :                     args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1449        3234 :                     args.dfSrcXDelta =
    1450        3234 :                         dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1451        3234 :                     args.dfSrcYDelta =
    1452        3234 :                         dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1453        3234 :                     args.eWrkDataType = eWrkDataType;
    1454        3234 :                     args.pabyChunkNodataMask =
    1455        3234 :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
    1456        3234 :                     args.nChunkXOff =
    1457        3234 :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1458        3234 :                     args.nChunkXSize = nChunkXSizeQueried;
    1459        3234 :                     args.nChunkYOff =
    1460        3234 :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1461        3234 :                     args.nChunkYSize = nChunkYSizeQueried;
    1462        3234 :                     args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1463        3234 :                     args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
    1464        3234 :                     args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1465        3234 :                     args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
    1466        3234 :                     args.pszResampling = pszResampling;
    1467        3234 :                     args.bHasNoData = bHasNoData;
    1468        3234 :                     args.dfNoDataValue = dfNoDataValue;
    1469        3234 :                     args.poColorTable = GetColorTable();
    1470        3234 :                     args.bPropagateNoData = bPropagateNoData;
    1471        3234 :                     eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
    1472             :                                            &eDstBufferDataType);
    1473        3234 :                     if (eErr == CE_None)
    1474             :                     {
    1475        3234 :                         eErr = poMEMBand->RasterIO(
    1476             :                             GF_Write, nDstXOff + nDestXOffVirtual,
    1477             :                             nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
    1478             :                             pDstBuffer, nDstXCount, nDstYCount,
    1479             :                             eDstBufferDataType, 0, 0, nullptr);
    1480             :                     }
    1481        3234 :                     CPLFree(pDstBuffer);
    1482             :                 }
    1483             : 
    1484        3263 :                 nBlocksDone++;
    1485        3689 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1486         426 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    1487             :                                              "", psExtraArg->pProgressData))
    1488             :                 {
    1489           1 :                     eErr = CE_Failure;
    1490             :                 }
    1491             :             }
    1492             :         }
    1493             : 
    1494        3263 :         CPLFree(pChunk);
    1495        3263 :         CPLFree(pabyChunkNoDataMask);
    1496             :     }
    1497             : 
    1498        3412 :     if (eBufType != eDataType)
    1499             :     {
    1500          44 :         CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
    1501             :             GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
    1502             :             pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    1503             :             nullptr));
    1504             :     }
    1505        3412 :     GDALClose(poMEMDS);
    1506        3412 :     VSIFree(pTempBuffer);
    1507             : 
    1508        3412 :     return eErr;
    1509             : }
    1510             : 
    1511             : /************************************************************************/
    1512             : /*                          RasterIOResampled()                         */
    1513             : /************************************************************************/
    1514             : 
    1515         886 : CPLErr GDALDataset::RasterIOResampled(
    1516             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1517             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1518             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    1519             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    1520             : 
    1521             : {
    1522             : #if 0
    1523             :     // Determine if we use warping resampling or overview resampling
    1524             :     bool bUseWarp = false;
    1525             :     if( GDALDataTypeIsComplex( eDataType ) )
    1526             :         bUseWarp = true;
    1527             : #endif
    1528             : 
    1529         886 :     double dfXOff = nXOff;
    1530         886 :     double dfYOff = nYOff;
    1531         886 :     double dfXSize = nXSize;
    1532         886 :     double dfYSize = nYSize;
    1533         886 :     if (psExtraArg->bFloatingPointWindowValidity)
    1534             :     {
    1535         765 :         dfXOff = psExtraArg->dfXOff;
    1536         765 :         dfYOff = psExtraArg->dfYOff;
    1537         765 :         dfXSize = psExtraArg->dfXSize;
    1538         765 :         dfYSize = psExtraArg->dfYSize;
    1539             :     }
    1540             : 
    1541         886 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1542         886 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1543             : 
    1544             :     // Determine the coordinates in the "virtual" output raster to see
    1545             :     // if they are integers, in which case we will use them as a shift
    1546             :     // so that subwindow extracts give the exact same results as entire raster
    1547             :     // scaling.
    1548         886 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1549         886 :     bool bHasXOffVirtual = false;
    1550         886 :     int nDestXOffVirtual = 0;
    1551         886 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1552             :     {
    1553         761 :         bHasXOffVirtual = true;
    1554         761 :         dfXOff = nXOff;
    1555         761 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1556             :     }
    1557             : 
    1558         886 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1559         886 :     bool bHasYOffVirtual = false;
    1560         886 :     int nDestYOffVirtual = 0;
    1561         886 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1562             :     {
    1563         721 :         bHasYOffVirtual = true;
    1564         721 :         dfYOff = nYOff;
    1565         721 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1566             :     }
    1567             : 
    1568             :     // Create a MEM dataset that wraps the output buffer.
    1569             :     GDALDataset *poMEMDS =
    1570         886 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1571             :                            nDestYOffVirtual + nBufYSize, 0, eBufType, nullptr);
    1572             :     GDALRasterBand **papoDstBands = static_cast<GDALRasterBand **>(
    1573         886 :         CPLMalloc(nBandCount * sizeof(GDALRasterBand *)));
    1574         886 :     int nNBITS = 0;
    1575        2878 :     for (int i = 0; i < nBandCount; i++)
    1576             :     {
    1577        1992 :         char szBuffer[32] = {'\0'};
    1578        3984 :         int nRet = CPLPrintPointer(
    1579             :             szBuffer,
    1580        1992 :             static_cast<GByte *>(pData) - nPixelSpace * nDestXOffVirtual -
    1581        1992 :                 nLineSpace * nDestYOffVirtual + nBandSpace * i,
    1582             :             sizeof(szBuffer));
    1583        1992 :         szBuffer[nRet] = 0;
    1584             : 
    1585        1992 :         char szBuffer0[64] = {'\0'};
    1586        1992 :         snprintf(szBuffer0, sizeof(szBuffer0), "DATAPOINTER=%s", szBuffer);
    1587             : 
    1588        1992 :         char szBuffer1[64] = {'\0'};
    1589        1992 :         snprintf(szBuffer1, sizeof(szBuffer1), "PIXELOFFSET=" CPL_FRMT_GIB,
    1590             :                  static_cast<GIntBig>(nPixelSpace));
    1591             : 
    1592        1992 :         char szBuffer2[64] = {'\0'};
    1593        1992 :         snprintf(szBuffer2, sizeof(szBuffer2), "LINEOFFSET=" CPL_FRMT_GIB,
    1594             :                  static_cast<GIntBig>(nLineSpace));
    1595             : 
    1596        1992 :         char *apszOptions[4] = {szBuffer0, szBuffer1, szBuffer2, nullptr};
    1597             : 
    1598        1992 :         poMEMDS->AddBand(eBufType, apszOptions);
    1599             : 
    1600        1992 :         GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
    1601        1992 :         papoDstBands[i] = poMEMDS->GetRasterBand(i + 1);
    1602             :         const char *pszNBITS =
    1603        1992 :             poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1604        1992 :         if (pszNBITS)
    1605             :         {
    1606           0 :             nNBITS = atoi(pszNBITS);
    1607           0 :             poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
    1608           0 :                                                            "IMAGE_STRUCTURE");
    1609             :         }
    1610             :     }
    1611             : 
    1612         886 :     CPLErr eErr = CE_None;
    1613             : 
    1614             :     // TODO(schwehr): Why disabled?  Why not just delete?
    1615             :     // Looks like this code was initially added as disabled by copying
    1616             :     // from RasterIO here:
    1617             :     // https://trac.osgeo.org/gdal/changeset/29572
    1618             : #if 0
    1619             :     // Do the resampling.
    1620             :     if( bUseWarp )
    1621             :     {
    1622             :         VRTDatasetH hVRTDS = nullptr;
    1623             :         GDALRasterBandH hVRTBand = nullptr;
    1624             :         if( GetDataset() == nullptr )
    1625             :         {
    1626             :             /* Create VRT dataset that wraps the whole dataset */
    1627             :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1628             :             VRTAddBand( hVRTDS, eDataType, nullptr );
    1629             :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1630             :             VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
    1631             :                                 (GDALRasterBandH)this,
    1632             :                                 0, 0,
    1633             :                                 nRasterXSize, nRasterYSize,
    1634             :                                 0, 0,
    1635             :                                 nRasterXSize, nRasterYSize,
    1636             :                                 nullptr, VRT_NODATA_UNSET );
    1637             : 
    1638             :             /* Add a mask band if needed */
    1639             :             if( GetMaskFlags() != GMF_ALL_VALID )
    1640             :             {
    1641             :                 ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
    1642             :                 VRTSourcedRasterBand* poVRTMaskBand =
    1643             :                     (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
    1644             :                 poVRTMaskBand->
    1645             :                     AddMaskBandSource( this,
    1646             :                                     0, 0,
    1647             :                                     nRasterXSize, nRasterYSize,
    1648             :                                     0, 0,
    1649             :                                     nRasterXSize, nRasterYSize);
    1650             :             }
    1651             :         }
    1652             : 
    1653             :         GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
    1654             :         psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
    1655             :         psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
    1656             :         psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
    1657             :         psWarpOptions->nBandCount = 1;
    1658             :         int nSrcBandNumber = (hVRTDS ? 1 : nBand);
    1659             :         int nDstBandNumber = 1;
    1660             :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1661             :         psWarpOptions->panDstBands = &nDstBandNumber;
    1662             :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
    1663             :                     psExtraArg->pfnProgress : GDALDummyProgress;
    1664             :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1665             :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1666             :         GDALRasterIOTransformerStruct sTransformer;
    1667             :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1668             :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1669             :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1670             :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1671             :         psWarpOptions->pTransformerArg = &sTransformer;
    1672             : 
    1673             :         GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
    1674             :         eErr = GDALChunkAndWarpImage( hWarpOperation,
    1675             :                                       nDestXOffVirtual, nDestYOffVirtual,
    1676             :                                       nBufXSize, nBufYSize );
    1677             :         GDALDestroyWarpOperation( hWarpOperation );
    1678             : 
    1679             :         psWarpOptions->panSrcBands = nullptr;
    1680             :         psWarpOptions->panDstBands = nullptr;
    1681             :         GDALDestroyWarpOptions( psWarpOptions );
    1682             : 
    1683             :         if( hVRTDS )
    1684             :             GDALClose(hVRTDS);
    1685             :     }
    1686             :     else
    1687             : #endif
    1688             :     {
    1689         886 :         const char *pszResampling =
    1690        1653 :             (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
    1691         767 :             : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
    1692           0 :             : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
    1693           0 :             : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
    1694           0 :             : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
    1695           0 :             : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
    1696           0 :             : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
    1697           0 :             : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
    1698             :                                                                : "UNKNOWN";
    1699             : 
    1700         886 :         GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
    1701         886 :         GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
    1702             :         int nBlockXSize, nBlockYSize;
    1703         886 :         poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    1704             : 
    1705             :         int nKernelRadius;
    1706             :         GDALResampleFunction pfnResampleFunc =
    1707         886 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1708         886 :         CPLAssert(pfnResampleFunc);
    1709             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1710             :         GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
    1711             :             GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
    1712             : #endif
    1713             :         GDALDataType eWrkDataType =
    1714         886 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1715             : 
    1716         886 :         int nDstBlockXSize = nBufXSize;
    1717         886 :         int nDstBlockYSize = nBufYSize;
    1718             :         int nFullResXChunk, nFullResYChunk;
    1719             :         while (true)
    1720             :         {
    1721         886 :             nFullResXChunk =
    1722         886 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1723         886 :             nFullResYChunk =
    1724         886 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1725         886 :             if (nFullResXChunk > nRasterXSize)
    1726         585 :                 nFullResXChunk = nRasterXSize;
    1727         886 :             if (nFullResYChunk > nRasterYSize)
    1728          51 :                 nFullResYChunk = nRasterYSize;
    1729         886 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1730         884 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1731             :                  1024 * 1024))
    1732             :                 break;
    1733             :             // When operating on the full width of a raster whose block width is
    1734             :             // the raster width, prefer doing chunks in height.
    1735           0 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1736             :                 nDstBlockYSize > 1)
    1737           0 :                 nDstBlockYSize /= 2;
    1738             :             /* Otherwise cut the maximal dimension */
    1739           0 :             else if (nDstBlockXSize > 1 &&
    1740           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1741           0 :                 nDstBlockXSize /= 2;
    1742             :             else
    1743           0 :                 nDstBlockYSize /= 2;
    1744             :         }
    1745             : 
    1746        1772 :         int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    1747         886 :                                   static_cast<int>(0.5 + dfYRatioDstToSrc));
    1748         886 :         if (nOvrFactor == 0)
    1749         104 :             nOvrFactor = 1;
    1750         886 :         int nFullResXSizeQueried =
    1751         886 :             nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    1752         886 :         int nFullResYSizeQueried =
    1753         886 :             nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
    1754             : 
    1755         886 :         if (nFullResXSizeQueried > nRasterXSize)
    1756         610 :             nFullResXSizeQueried = nRasterXSize;
    1757         886 :         if (nFullResYSizeQueried > nRasterYSize)
    1758          54 :             nFullResYSizeQueried = nRasterYSize;
    1759             : 
    1760         886 :         void *pChunk = VSI_MALLOC3_VERBOSE(
    1761             :             cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
    1762             :                               nBandCount),
    1763             :             nFullResXSizeQueried, nFullResYSizeQueried);
    1764         886 :         GByte *pabyChunkNoDataMask = nullptr;
    1765             : 
    1766         886 :         GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
    1767         886 :         int nMaskFlags = poFirstSrcBand->GetMaskFlags();
    1768             : 
    1769         886 :         bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
    1770         886 :         if (bUseNoDataMask)
    1771             :         {
    1772         617 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1773             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1774             :         }
    1775         886 :         if (pChunk == nullptr ||
    1776         617 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1777             :         {
    1778           0 :             GDALClose(poMEMDS);
    1779           0 :             CPLFree(pChunk);
    1780           0 :             CPLFree(pabyChunkNoDataMask);
    1781           0 :             CPLFree(papoDstBands);
    1782           0 :             return CE_Failure;
    1783             :         }
    1784             : 
    1785         886 :         const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
    1786         886 :                                  DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
    1787         886 :         int nBlocksDone = 0;
    1788             : 
    1789             :         int nDstYOff;
    1790        1772 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1791         886 :              nDstYOff += nDstBlockYSize)
    1792             :         {
    1793             :             int nDstYCount;
    1794         886 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1795         886 :                 nDstYCount = nDstBlockYSize;
    1796             :             else
    1797           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1798             : 
    1799         886 :             int nChunkYOff =
    1800         886 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1801         886 :             int nChunkYOff2 = nYOff + 1 +
    1802         886 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1803             :                                                     dfYRatioDstToSrc));
    1804         886 :             if (nChunkYOff2 > nRasterYSize)
    1805         133 :                 nChunkYOff2 = nRasterYSize;
    1806         886 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1807         886 :             CPLAssert(nYCount <= nFullResYChunk);
    1808             : 
    1809         886 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    1810         886 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
    1811         886 :             if (nChunkYOffQueried < 0)
    1812             :             {
    1813         136 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1814         136 :                 nChunkYOffQueried = 0;
    1815             :             }
    1816         886 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1817         151 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1818         886 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1819             : 
    1820             :             int nDstXOff;
    1821        1772 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1822         886 :                  nDstXOff += nDstBlockXSize)
    1823             :             {
    1824             :                 int nDstXCount;
    1825         886 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1826         886 :                     nDstXCount = nDstBlockXSize;
    1827             :                 else
    1828           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1829             : 
    1830         886 :                 int nChunkXOff =
    1831         886 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1832         886 :                 int nChunkXOff2 =
    1833         886 :                     nXOff + 1 +
    1834         886 :                     static_cast<int>(
    1835         886 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1836         886 :                 if (nChunkXOff2 > nRasterXSize)
    1837         641 :                     nChunkXOff2 = nRasterXSize;
    1838         886 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1839         886 :                 CPLAssert(nXCount <= nFullResXChunk);
    1840             : 
    1841         886 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    1842         886 :                 int nChunkXSizeQueried =
    1843         886 :                     nXCount + 2 * nKernelRadius * nOvrFactor;
    1844         886 :                 if (nChunkXOffQueried < 0)
    1845             :                 {
    1846         641 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1847         641 :                     nChunkXOffQueried = 0;
    1848             :                 }
    1849         886 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1850         649 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1851         886 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1852             : 
    1853         886 :                 bool bSkipResample = false;
    1854         886 :                 bool bNoDataMaskFullyOpaque = false;
    1855         886 :                 if (eErr == CE_None && bUseNoDataMask)
    1856             :                 {
    1857         617 :                     eErr = poMaskBand->RasterIO(
    1858             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1859             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1860             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1861             :                         nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
    1862             : 
    1863             :                     /* Optimizations if mask is fully opaque or transparent */
    1864         617 :                     const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1865         617 :                     const GByte bVal = pabyChunkNoDataMask[0];
    1866         617 :                     int i = 1;  // Used after for.
    1867    48197000 :                     for (; i < nPixels; i++)
    1868             :                     {
    1869    48196500 :                         if (pabyChunkNoDataMask[i] != bVal)
    1870          72 :                             break;
    1871             :                     }
    1872         617 :                     if (i == nPixels)
    1873             :                     {
    1874         545 :                         if (bVal == 0)
    1875             :                         {
    1876         373 :                             GByte abyZero[16] = {0};
    1877         780 :                             for (int iBand = 0; iBand < nBandCount; iBand++)
    1878             :                             {
    1879        3499 :                                 for (int j = 0; j < nDstYCount; j++)
    1880             :                                 {
    1881        3092 :                                     GDALCopyWords64(
    1882             :                                         abyZero, GDT_UInt8, 0,
    1883             :                                         static_cast<GByte *>(pData) +
    1884        3092 :                                             iBand * nBandSpace +
    1885        3092 :                                             nLineSpace * (j + nDstYOff) +
    1886        3092 :                                             nDstXOff * nPixelSpace,
    1887             :                                         eBufType, static_cast<int>(nPixelSpace),
    1888             :                                         nDstXCount);
    1889             :                                 }
    1890             :                             }
    1891         373 :                             bSkipResample = true;
    1892             :                         }
    1893             :                         else
    1894             :                         {
    1895         172 :                             bNoDataMaskFullyOpaque = true;
    1896             :                         }
    1897             :                     }
    1898             :                 }
    1899             : 
    1900         886 :                 if (!bSkipResample && eErr == CE_None)
    1901             :                 {
    1902             :                     /* Read the source buffers */
    1903         510 :                     eErr = RasterIO(
    1904             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1905             :                         nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1906             :                         nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
    1907             :                         nBandCount, panBandMap, 0, 0, 0, nullptr);
    1908             :                 }
    1909             : 
    1910             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1911             :                 if (pfnResampleFuncMultiBands && !bSkipResample &&
    1912             :                     eErr == CE_None)
    1913             :                 {
    1914             :                     eErr = pfnResampleFuncMultiBands(
    1915             :                         dfXRatioDstToSrc, dfYRatioDstToSrc,
    1916             :                         dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
    1917             :                         dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
    1918             :                         eWrkDataType, (GByte *)pChunk, nBandCount,
    1919             :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
    1920             :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
    1921             :                         nChunkXSizeQueried,
    1922             :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
    1923             :                         nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
    1924             :                         nDstXOff + nDestXOffVirtual + nDstXCount,
    1925             :                         nDstYOff + nDestYOffVirtual,
    1926             :                         nDstYOff + nDestYOffVirtual + nDstYCount, papoDstBands,
    1927             :                         pszResampling, FALSE /*bHasNoData*/,
    1928             :                         0.0 /* dfNoDataValue */, nullptr /* color table*/,
    1929             :                         eDataType);
    1930             :                 }
    1931             :                 else
    1932             : #endif
    1933             :                 {
    1934             :                     size_t nChunkBandOffset =
    1935         886 :                         static_cast<size_t>(nChunkXSizeQueried) *
    1936         886 :                         nChunkYSizeQueried *
    1937         886 :                         GDALGetDataTypeSizeBytes(eWrkDataType);
    1938        2462 :                     for (int i = 0;
    1939        2462 :                          i < nBandCount && !bSkipResample && eErr == CE_None;
    1940             :                          i++)
    1941             :                     {
    1942        1576 :                         const bool bPropagateNoData = false;
    1943        1576 :                         void *pDstBuffer = nullptr;
    1944        1576 :                         GDALDataType eDstBufferDataType = GDT_Unknown;
    1945             :                         GDALRasterBand *poMEMBand =
    1946        1576 :                             poMEMDS->GetRasterBand(i + 1);
    1947        1576 :                         GDALOverviewResampleArgs args;
    1948        1576 :                         args.eSrcDataType = eDataType;
    1949        1576 :                         args.eOvrDataType = poMEMBand->GetRasterDataType();
    1950        1576 :                         args.nOvrXSize = poMEMBand->GetXSize();
    1951        1576 :                         args.nOvrYSize = poMEMBand->GetYSize();
    1952        1576 :                         args.nOvrNBITS = nNBITS;
    1953        1576 :                         args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1954        1576 :                         args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1955        1576 :                         args.dfSrcXDelta =
    1956        1576 :                             dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1957        1576 :                         args.dfSrcYDelta =
    1958        1576 :                             dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1959        1576 :                         args.eWrkDataType = eWrkDataType;
    1960        1576 :                         args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
    1961        1576 :                                                        ? nullptr
    1962             :                                                        : pabyChunkNoDataMask;
    1963        1576 :                         args.nChunkXOff =
    1964        1576 :                             nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1965        1576 :                         args.nChunkXSize = nChunkXSizeQueried;
    1966        1576 :                         args.nChunkYOff =
    1967        1576 :                             nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1968        1576 :                         args.nChunkYSize = nChunkYSizeQueried;
    1969        1576 :                         args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1970        1576 :                         args.nDstXOff2 =
    1971        1576 :                             nDstXOff + nDestXOffVirtual + nDstXCount;
    1972        1576 :                         args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1973        1576 :                         args.nDstYOff2 =
    1974        1576 :                             nDstYOff + nDestYOffVirtual + nDstYCount;
    1975        1576 :                         args.pszResampling = pszResampling;
    1976        1576 :                         args.bHasNoData = false;
    1977        1576 :                         args.dfNoDataValue = 0.0;
    1978        1576 :                         args.poColorTable = nullptr;
    1979        1576 :                         args.bPropagateNoData = bPropagateNoData;
    1980             : 
    1981             :                         eErr =
    1982        3152 :                             pfnResampleFunc(args,
    1983        1576 :                                             reinterpret_cast<GByte *>(pChunk) +
    1984        1576 :                                                 i * nChunkBandOffset,
    1985             :                                             &pDstBuffer, &eDstBufferDataType);
    1986        1576 :                         if (eErr == CE_None)
    1987             :                         {
    1988        1576 :                             eErr = poMEMBand->RasterIO(
    1989             :                                 GF_Write, nDstXOff + nDestXOffVirtual,
    1990             :                                 nDstYOff + nDestYOffVirtual, nDstXCount,
    1991             :                                 nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
    1992             :                                 eDstBufferDataType, 0, 0, nullptr);
    1993             :                         }
    1994        1576 :                         CPLFree(pDstBuffer);
    1995             :                     }
    1996             :                 }
    1997             : 
    1998         886 :                 nBlocksDone++;
    1999        1275 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    2000         389 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    2001             :                                              "", psExtraArg->pProgressData))
    2002             :                 {
    2003           0 :                     eErr = CE_Failure;
    2004             :                 }
    2005             :             }
    2006             :         }
    2007             : 
    2008         886 :         CPLFree(pChunk);
    2009         886 :         CPLFree(pabyChunkNoDataMask);
    2010             :     }
    2011             : 
    2012         886 :     CPLFree(papoDstBands);
    2013         886 :     GDALClose(poMEMDS);
    2014             : 
    2015         886 :     return eErr;
    2016             : }
    2017             : 
    2018             : //! @endcond
    2019             : 
    2020             : /************************************************************************/
    2021             : /*                           GDALSwapWords()                            */
    2022             : /************************************************************************/
    2023             : 
    2024             : /**
    2025             :  * Byte swap words in-place.
    2026             :  *
    2027             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2028             :  * a memory array.  No assumption is made that the words being swapped are
    2029             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2030             :  * to determine if the current platform is big endian or little endian.  Use
    2031             :  * the macros like CPL_SWAP32() to byte swap single values without the overhead
    2032             :  * of a function call.
    2033             :  *
    2034             :  * @param pData pointer to start of data buffer.
    2035             :  * @param nWordSize size of words in bytes: 2, 4 or 8 (1 is accepted, no-op).
    2036             :  * @param nWordCount the number of words to be swapped in this call.
    2037             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2038             :  * the next. For packed buffers this is the same as nWordSize.
    2039             :  */
    2040             : 
    2041      497143 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
    2042             :                                int nWordSkip)
    2043             : 
    2044             : {
    2045      497143 :     if (nWordCount > 0)  // the pointer is only checked when there are words to swap
    2046      497143 :         VALIDATE_POINTER0(pData, "GDALSwapWords");
    2047             : 
    2048      497143 :     GByte *pabyData = static_cast<GByte *>(pData);  // byte cursor, advanced by nWordSkip after each word
    2049             : 
    2050      497143 :     switch (nWordSize)
    2051             :     {
    2052        7234 :         case 1:
    2053        7234 :             break;  // single-byte words: nothing to swap
    2054             : 
    2055      476903 :         case 2:
    2056      476903 :             CPLAssert(nWordSkip >= 2 || nWordCount == 1);  // stride must cover a whole word unless only one word is swapped
    2057   228062000 :             for (int i = 0; i < nWordCount; i++)
    2058             :             {
    2059   227585000 :                 CPL_SWAP16PTR(pabyData);
    2060   227585000 :                 pabyData += nWordSkip;
    2061             :             }
    2062      476903 :             break;
    2063             : 
    2064       10580 :         case 4:
    2065       10580 :             CPLAssert(nWordSkip >= 4 || nWordCount == 1);  // same stride requirement as for 2-byte words
    2066       10580 :             if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)  // aligned base and stride multiple of 4: access words as GUInt32
    2067             :             {
    2068    29140500 :                 for (int i = 0; i < nWordCount; i++)
    2069             :                 {
    2070    29130000 :                     *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
    2071             :                         *reinterpret_cast<const GUInt32 *>(pabyData));
    2072    29130000 :                     pabyData += nWordSkip;
    2073       10577 :                 }
    2074             :             }
    2075             :             else  // otherwise swap through the pointer-based macro instead of a GUInt32 access
    2076             :             {
    2077           9 :                 for (int i = 0; i < nWordCount; i++)
    2078             :                 {
    2079           6 :                     CPL_SWAP32PTR(pabyData);
    2080           6 :                     pabyData += nWordSkip;
    2081             :                 }
    2082             :             }
    2083       10580 :             break;
    2084             : 
    2085        2426 :         case 8:
    2086        2426 :             CPLAssert(nWordSkip >= 8 || nWordCount == 1);  // same stride requirement as for 2-byte words
    2087        2426 :             if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)  // aligned base and stride multiple of 8: access words as GUInt64
    2088             :             {
    2089     3356900 :                 for (int i = 0; i < nWordCount; i++)
    2090             :                 {
    2091     3354480 :                     *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
    2092             :                         *reinterpret_cast<const GUInt64 *>(pabyData));
    2093     3354480 :                     pabyData += nWordSkip;
    2094        2425 :                 }
    2095             :             }
    2096             :             else  // otherwise swap through the pointer-based macro instead of a GUInt64 access
    2097             :             {
    2098           3 :                 for (int i = 0; i < nWordCount; i++)
    2099             :                 {
    2100           2 :                     CPL_SWAP64PTR(pabyData);
    2101           2 :                     pabyData += nWordSkip;
    2102             :                 }
    2103             :             }
    2104        2426 :             break;
    2105             : 
    2106           0 :         default:
    2107           0 :             CPLAssert(false);  // any other word size is not handled: nothing is swapped
    2108             :     }
    2109             : }
    2110             : 
    2111             : /************************************************************************/
    2112             : /*                           GDALSwapWordsEx()                          */
    2113             : /************************************************************************/
    2114             : 
    2115             : /**
    2116             :  * Byte swap words in-place (variant taking a size_t word count).
    2117             :  *
    2118             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2119             :  * a memory array.  No assumption is made that the words being swapped are
    2120             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2121             :  * to determine if the current platform is big endian or little endian.  Use
    2122             :  * the macros like CPL_SWAP32() to byte swap single values without the overhead
    2123             :  * of a function call.
    2124             :  *
    2125             :  * @param pData pointer to start of data buffer.
    2126             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2127             :  * @param nWordCount the number of words to swap; forwarded to GDALSwapWords() in chunks.
    2128             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2129             :  * the next. For packed buffers this is the same as nWordSize.
    2130             :  */
    2131        6124 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
    2132             :                                  int nWordSkip)
    2133             : {
    2134        6124 :     GByte *pabyData = static_cast<GByte *>(pData);
    2135       12248 :     while (nWordCount)  // loop until every word has been handed to GDALSwapWords()
    2136             :     {
    2137             :         // Cap each call at 2^30 words (a multiple of 8) so that the count fits in an int.
    2138        6124 :         const int nWordCountSmall =
    2139        6124 :             (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
    2140        6124 :         GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
    2141        6124 :         pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;  // advance past the words just swapped
    2142        6124 :         nWordCount -= nWordCountSmall;
    2143             :     }
    2144        6124 : }
    2145             : 
    2146             : // Place the new GDALCopyWords helpers in an anonymous namespace
    2147             : namespace
    2148             : {
    2149             : 
    2150             : /************************************************************************/
    2151             : /*                           GDALCopyWordsT()                           */
    2152             : /************************************************************************/
    2153             : /**
    2154             :  * Template function, used to copy data from pSrcData into buffer
    2155             :  * pDstData, with byte stride nSrcPixelStride in the source data and
    2156             :  * byte stride nDstPixelStride in the destination data. This template can
    2157             :  * deal with the case where the input data type is real or complex and
    2158             :  * the output is real.
    2159             :  *
    2160             :  * @param pSrcData the source data buffer
    2161             :  * @param nSrcPixelStride the stride in bytes, in the buffer pSrcData for pixels
    2162             :  *                      of interest.
    2163             :  * @param pDstData the destination buffer.
    2164             :  * @param nDstPixelStride the stride in bytes, in the buffer pDstData for pixels
    2165             :  *                      of interest.
    2166             :  * @param nWordCount the total number of pixel words to copy
    2167             :  *
    2168             :  * @code
    2169             :  * // Assume an input buffer of type GUInt16 named pBufferIn
    2170             :  * GByte *pBufferOut = new GByte[numBytesOut];
    2171             :  * GDALCopyWordsT<GUInt16, GByte>(pBufferIn, 2, pBufferOut, 1, numBytesOut);
    2172             :  * @endcode
    2173             :  * @note
    2174             :  * This is a private function, and should not be exposed outside of
    2175             :  * rasterio.cpp. External users should call the GDALCopyWords driver function.
    2176             :  */
    2177             : 
    2178             : template <class Tin, class Tout>
    2179    42454229 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
    2180             :                                          int nSrcPixelStride,
    2181             :                                          Tout *const CPL_RESTRICT pDstData,
    2182             :                                          int nDstPixelStride,
    2183             :                                          GPtrDiff_t nWordCount)
    2184             : {
    2185    42454229 :     decltype(nWordCount) nDstOffset = 0;  // running byte offset into the destination
    2186             : 
    2187    42454229 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);  // char views so strides apply in bytes
    2188    42454229 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2189   384343861 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2190             :     {
    2191   341889564 :         const Tin tValue =
    2192   341889564 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2193   341889564 :         Tout *const pOutPixel =
    2194   341889564 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2195             : 
    2196   341889564 :         GDALCopyWord(tValue, *pOutPixel);  // the per-value Tin -> Tout conversion is delegated to GDALCopyWord
    2197             : 
    2198   341889564 :         nDstOffset += nDstPixelStride;
    2199             :     }
    2200    42454229 : }
    2201             : 
    2202             : template <class Tin, class Tout>  // primary template; explicit specializations for GByte sources follow under HAVE_SSE2
    2203    29786219 : static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
    2204             :                                         int nSrcPixelStride,
    2205             :                                         Tout *const CPL_RESTRICT pDstData,
    2206             :                                         int nDstPixelStride,
    2207             :                                         GPtrDiff_t nWordCount)
    2208             : {
    2209    29786219 :     GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,  // forwards all arguments unchanged to the generic loop
    2210             :                           nWordCount);
    2211    29786219 : }
    2212             : 
    2213             : template <class Tin, class Tout>  // strided Tin -> Tout copy that converts 8 words per call when both buffers are packed
    2214     5076559 : static void inline GDALCopyWordsT_8atatime(
    2215             :     const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
    2216             :     Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
    2217             :     GPtrDiff_t nWordCount)
    2218             : {
    2219     5076559 :     decltype(nWordCount) nDstOffset = 0;  // running byte offset into the destination, shared by both loops
    2220             : 
    2221     5076559 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2222     5076559 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2223     5076559 :     decltype(nWordCount) n = 0;  // index of the next word to copy, shared by both loops
    2224     5076559 :     if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&  // batch path only when source and destination are both packed
    2225             :         nDstPixelStride == static_cast<int>(sizeof(Tout)))
    2226             :     {
    2227    57868365 :         for (; n < nWordCount - 7; n += 8)  // 8 words per iteration
    2228             :         {
    2229    57324286 :             const Tin *pInValues = reinterpret_cast<const Tin *>(
    2230    57324286 :                 pSrcDataPtr + (n * nSrcPixelStride));
    2231    57324286 :             Tout *const pOutPixels =
    2232    57324286 :                 reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2233             : 
    2234    57324286 :             GDALCopy8Words(pInValues, pOutPixels);  // conversion of the 8 values is delegated to GDALCopy8Words
    2235             : 
    2236    57324286 :             nDstOffset += 8 * nDstPixelStride;
    2237             :         }
    2238             :     }
    2239    10454636 :     for (; n < nWordCount; n++)  // words left over by the batch loop, or all words when a stride is not packed
    2240             :     {
    2241     5378077 :         const Tin tValue =
    2242     5378077 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2243     5378077 :         Tout *const pOutPixel =
    2244     5378077 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2245             : 
    2246     5378077 :         GDALCopyWord(tValue, *pOutPixel);
    2247             : 
    2248     5378077 :         nDstOffset += nDstPixelStride;
    2249             :     }
    2250     5076559 : }
    2251             : 
    2252             : #ifdef HAVE_SSE2
    2253             : 
    2254             : template <class Tout>  // SSE2 helper: copies GByte values into a 16-bit integer buffer, zero-extending each byte
    2255       39717 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
    2256             :                               int nSrcPixelStride,
    2257             :                               Tout *const CPL_RESTRICT pDstData,
    2258             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2259             : {
    2260             :     static_assert(std::is_integral<Tout>::value &&  // Tout must be an integral type of exactly 16 bits
    2261             :                       sizeof(Tout) == sizeof(uint16_t),
    2262             :                   "Bad Tout");
    2263       39717 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&  // vector path only when both buffers are packed
    2264             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2265             :     {
    2266       33366 :         decltype(nWordCount) n = 0;
    2267       33366 :         const __m128i xmm_zero = _mm_setzero_si128();
    2268       33366 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2269             :             reinterpret_cast<GByte *>(pDstData);
    2270     1415762 :         for (; n < nWordCount - 15; n += 16)  // 16 source bytes per iteration
    2271             :         {
    2272     1382396 :             __m128i xmm = _mm_loadu_si128(  // unaligned load of 16 source bytes
    2273     1382396 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2274     1382396 :             __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);  // bytes 0..7 interleaved with zero bytes: zero-extended to 16 bit
    2275     1382396 :             __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);  // same for bytes 8..15
    2276             :             _mm_storeu_si128(
    2277     1382396 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);  // n * 2 is the byte offset of 16-bit word n
    2278             :             _mm_storeu_si128(
    2279     1382396 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
    2280             :         }
    2281      109389 :         for (; n < nWordCount; n++)  // scalar copy of the words left over by the 16-wide loop
    2282             :         {
    2283       76023 :             pDstData[n] = pSrcData[n];
    2284       33366 :         }
    2285             :     }
    2286             :     else  // strided buffers: fall back to the generic word-by-word copy
    2287             :     {
    2288        6351 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2289             :                               nDstPixelStride, nWordCount);
    2290             :     }
    2291       39717 : }
    2292             : 
    2293             : template <>  // GByte -> GUInt16 specialization of GDALCopyWordsT
    2294       26977 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2295             :                                  int nSrcPixelStride,
    2296             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    2297             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2298             : {
    2299       26977 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,  // delegates to the shared Byte -> 16-bit helper
    2300             :                              nDstPixelStride, nWordCount);
    2301       26977 : }
    2302             : 
    2303             : template <>  // GByte -> GInt16 specialization of GDALCopyWordsT
    2304       12740 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2305             :                                  int nSrcPixelStride,
    2306             :                                  GInt16 *const CPL_RESTRICT pDstData,
    2307             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2308             : {
    2309       12740 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,  // delegates to the shared Byte -> 16-bit helper
    2310             :                              nDstPixelStride, nWordCount);
    2311       12740 : }
    2312             : 
    2313             : template <class Tout>  // SSE2 helper: copies GByte values into a 32-bit integer buffer, zero-extending each byte
    2314    12854476 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
    2315             :                               int nSrcPixelStride,
    2316             :                               Tout *const CPL_RESTRICT pDstData,
    2317             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2318             : {
    2319             :     static_assert(std::is_integral<Tout>::value &&  // Tout must be an integral type of exactly 32 bits
    2320             :                       sizeof(Tout) == sizeof(uint32_t),
    2321             :                   "Bad Tout");
    2322    12854476 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&  // vector path only when both buffers are packed
    2323             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2324             :     {
    2325     6293756 :         decltype(nWordCount) n = 0;
    2326     6293756 :         const __m128i xmm_zero = _mm_setzero_si128();
    2327     6293756 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2328             :             reinterpret_cast<GByte *>(pDstData);
    2329    70192427 :         for (; n < nWordCount - 15; n += 16)  // 16 source bytes per iteration
    2330             :         {
    2331    63898661 :             __m128i xmm = _mm_loadu_si128(  // unaligned load of 16 source bytes
    2332    63898661 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2333    63898661 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);  // step 1: bytes 0..7 zero-extended to 16 bit
    2334    63898661 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);  // step 1: bytes 8..15 zero-extended to 16 bit
    2335    63898661 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);  // step 2: 16-bit values zero-extended to 32 bit (values 0..3)
    2336    63898661 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);  // values 4..7
    2337    63898661 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);  // values 8..11
    2338    63898661 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);  // values 12..15
    2339             :             _mm_storeu_si128(
    2340    63898661 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);  // n * 4 is the byte offset of 32-bit word n
    2341             :             _mm_storeu_si128(
    2342    63898661 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
    2343             :             _mm_storeu_si128(
    2344    63898661 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
    2345             :             _mm_storeu_si128(
    2346    63898661 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
    2347             :         }
    2348    14581316 :         for (; n < nWordCount; n++)  // scalar copy of the words left over by the 16-wide loop
    2349             :         {
    2350     8287610 :             pDstData[n] = pSrcData[n];
    2351     6293756 :         }
    2352             :     }
    2353             :     else  // strided buffers: fall back to the generic word-by-word copy
    2354             :     {
    2355     6560690 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2356             :                               nDstPixelStride, nWordCount);
    2357             :     }
    2358    12854476 : }
    2359             : 
    2360             : template <>  // GByte -> GUInt32 specialization of GDALCopyWordsT
    2361         476 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2362             :                                  int nSrcPixelStride,
    2363             :                                  GUInt32 *const CPL_RESTRICT pDstData,
    2364             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2365             : {
    2366         476 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,  // delegates to the shared Byte -> 32-bit helper
    2367             :                              nDstPixelStride, nWordCount);
    2368         476 : }
    2369             : 
    2370             : template <>  // GByte -> GInt32 specialization of GDALCopyWordsT
    2371    12854000 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2372             :                                  int nSrcPixelStride,
    2373             :                                  GInt32 *const CPL_RESTRICT pDstData,
    2374             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2375             : {
    2376    12854000 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,  // delegates to the shared Byte -> 32-bit helper
    2377             :                              nDstPixelStride, nWordCount);
    2378    12854000 : }
    2379             : 
    2380             : template <>  // GByte -> float specialization of GDALCopyWordsT, with an SSE2 path for packed buffers
    2381     2476020 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2382             :                                  int nSrcPixelStride,
    2383             :                                  float *const CPL_RESTRICT pDstData,
    2384             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2385             : {
    2386     2476020 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&  // vector path only when both buffers are packed
    2387             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2388             :     {
    2389      115285 :         decltype(nWordCount) n = 0;
    2390      115285 :         const __m128i xmm_zero = _mm_setzero_si128();
    2391      115285 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2392             :             reinterpret_cast<GByte *>(pDstData);
    2393     3324090 :         for (; n < nWordCount - 15; n += 16)  // 16 source bytes per iteration
    2394             :         {
    2395     3208800 :             __m128i xmm = _mm_loadu_si128(  // unaligned load of 16 source bytes
    2396     3208800 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2397     3208800 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);  // step 1: bytes 0..7 zero-extended to 16 bit
    2398     3208800 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);  // step 1: bytes 8..15 zero-extended to 16 bit
    2399     3208800 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);  // step 2: zero-extended to 32-bit integers (values 0..3)
    2400     3208800 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);  // values 4..7
    2401     3208800 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);  // values 8..11
    2402     3208800 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);  // values 12..15
    2403     3208800 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);  // step 3: 32-bit integers (0..255 here) converted to float
    2404     3208800 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2405     3208800 :             __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
    2406     3208800 :             __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
    2407     3208800 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),  // n * 4 is the byte offset of float n
    2408             :                           xmm0_f);
    2409             :             _mm_storeu_ps(
    2410     3208800 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2411             :             _mm_storeu_ps(
    2412     3208800 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
    2413             :             _mm_storeu_ps(
    2414     3208800 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
    2415             :         }
    2416      502808 :         for (; n < nWordCount; n++)  // scalar copy of the words left over by the 16-wide loop
    2417             :         {
    2418      387523 :             pDstData[n] = pSrcData[n];
    2419      115285 :         }
    2420             :     }
    2421             :     else  // strided buffers: fall back to the generic word-by-word copy
    2422             :     {
    2423     2360740 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2424             :                               nDstPixelStride, nWordCount);
    2425             :     }
    2426     2476020 : }
    2427             : 
    2428             : template <>
    2429      169970 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2430             :                                  int nSrcPixelStride,
    2431             :                                  double *const CPL_RESTRICT pDstData,
    2432             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2433             : {
    2434      169970 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2435             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2436             :     {
    2437      146506 :         decltype(nWordCount) n = 0;
    2438      146506 :         const __m128i xmm_zero = _mm_setzero_si128();
    2439      146506 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2440             :             reinterpret_cast<GByte *>(pDstData);
    2441     3126180 :         for (; n < nWordCount - 15; n += 16)
    2442             :         {
    2443     2979670 :             __m128i xmm = _mm_loadu_si128(
    2444     2979670 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2445     2979670 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2446     2979670 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2447     2979670 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2448     2979670 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2449     2979670 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2450     2979670 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2451             : 
    2452             : #if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
    2453             :             _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2454             :                              _mm256_cvtepi32_pd(xmm0));
    2455             :             _mm256_storeu_pd(
    2456             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2457             :                 _mm256_cvtepi32_pd(xmm1));
    2458             :             _mm256_storeu_pd(
    2459             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2460             :                 _mm256_cvtepi32_pd(xmm2));
    2461             :             _mm256_storeu_pd(
    2462             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2463             :                 _mm256_cvtepi32_pd(xmm3));
    2464             : #else
    2465     2979670 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2466     2979670 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2467     2979670 :             __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
    2468     2979670 :             __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
    2469     2979670 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2470     2979670 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2471     2979670 :             xmm2 = _mm_srli_si128(xmm2, 8);
    2472     2979670 :             xmm3 = _mm_srli_si128(xmm3, 8);
    2473     2979670 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2474     2979670 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2475     2979670 :             __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
    2476     2979670 :             __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
    2477             : 
    2478     2979670 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2479             :                           xmm0_low_d);
    2480             :             _mm_storeu_pd(
    2481     2979670 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2482             :                 xmm0_high_d);
    2483             :             _mm_storeu_pd(
    2484     2979670 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2485             :                 xmm1_low_d);
    2486             :             _mm_storeu_pd(
    2487     2979670 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2488             :                 xmm1_high_d);
    2489             :             _mm_storeu_pd(
    2490     2979670 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2491             :                 xmm2_low_d);
    2492             :             _mm_storeu_pd(
    2493     2979670 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
    2494             :                 xmm2_high_d);
    2495             :             _mm_storeu_pd(
    2496     2979670 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2497             :                 xmm3_low_d);
    2498             :             _mm_storeu_pd(
    2499     2979670 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
    2500             :                 xmm3_high_d);
    2501             : #endif
    2502             :         }
    2503      278002 :         for (; n < nWordCount; n++)
    2504             :         {
    2505      131496 :             pDstData[n] = pSrcData[n];
    2506      146506 :         }
    2507             :     }
    2508             :     else
    2509             :     {
    2510       23464 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2511             :                               nDstPixelStride, nWordCount);
    2512             :     }
    2513      169970 : }
    2514             : 
    2515             : template <>
    2516         148 : CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
    2517             :                                  int nSrcPixelStride,
    2518             :                                  int8_t *const CPL_RESTRICT pDstData,
    2519             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2520             : {
    2521         148 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2522             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2523             :     {
    2524         142 :         decltype(nWordCount) n = 0;
    2525         142 :         const __m128i xmm_127 = _mm_set1_epi8(127);
    2526         146 :         for (; n < nWordCount - 31; n += 32)
    2527             :         {
    2528           8 :             __m128i xmm0 = _mm_loadu_si128(
    2529           4 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2530           4 :             __m128i xmm1 = _mm_loadu_si128(
    2531           4 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 16));
    2532           4 :             xmm0 = _mm_min_epu8(xmm0, xmm_127);
    2533           4 :             xmm1 = _mm_min_epu8(xmm1, xmm_127);
    2534           4 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2535           4 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
    2536             :                              xmm1);
    2537             :         }
    2538        2424 :         for (; n < nWordCount; n++)
    2539             :         {
    2540        2282 :             pDstData[n] =
    2541        2282 :                 pSrcData[n] >= 127 ? 127 : static_cast<int8_t>(pSrcData[n]);
    2542         142 :         }
    2543             :     }
    2544             :     else
    2545             :     {
    2546           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2547             :                               nDstPixelStride, nWordCount);
    2548             :     }
    2549         148 : }
    2550             : 
    2551             : template <>
    2552          82 : CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
    2553             :                                  int nSrcPixelStride,
    2554             :                                  uint8_t *const CPL_RESTRICT pDstData,
    2555             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2556             : {
    2557          82 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2558             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2559             :     {
    2560          56 :         decltype(nWordCount) n = 0;
    2561             : #if !(defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS))
    2562          56 :         const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
    2563             : #endif
    2564         117 :         for (; n < nWordCount - 31; n += 32)
    2565             :         {
    2566         122 :             __m128i xmm0 = _mm_loadu_si128(
    2567          61 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2568          61 :             __m128i xmm1 = _mm_loadu_si128(
    2569          61 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 16));
    2570             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2571             :             xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
    2572             :             xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
    2573             : #else
    2574          61 :             xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
    2575          61 :             xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
    2576          61 :             xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
    2577          61 :             xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
    2578          61 :             xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
    2579          61 :             xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
    2580             : #endif
    2581          61 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2582          61 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
    2583             :                              xmm1);
    2584             :         }
    2585         352 :         for (; n < nWordCount; n++)
    2586             :         {
    2587         296 :             pDstData[n] =
    2588         296 :                 pSrcData[n] < 0 ? 0 : static_cast<uint8_t>(pSrcData[n]);
    2589          56 :         }
    2590             :     }
    2591             :     else
    2592             :     {
    2593          26 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2594             :                               nDstPixelStride, nWordCount);
    2595             :     }
    2596          82 : }
    2597             : 
    2598             : template <>
    2599        6037 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2600             :                                  int nSrcPixelStride,
    2601             :                                  uint8_t *const CPL_RESTRICT pDstData,
    2602             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2603             : {
    2604        6037 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2605             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2606             :     {
    2607        5062 :         decltype(nWordCount) n = 0;
    2608             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2609             :         const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
    2610             : #else
    2611             :         // In SSE2, min_epu16 does not exist, so shift from
    2612             :         // UInt16 to SInt16 to be able to use min_epi16
    2613        5062 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2614        5062 :         const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
    2615             : #endif
    2616       71888 :         for (; n < nWordCount - 15; n += 16)
    2617             :         {
    2618      133652 :             __m128i xmm0 = _mm_loadu_si128(
    2619       66826 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2620       66826 :             __m128i xmm1 = _mm_loadu_si128(
    2621       66826 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2622             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2623             :             xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
    2624             :             xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
    2625             : #else
    2626       66826 :             xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
    2627       66826 :             xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
    2628       66826 :             xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
    2629       66826 :             xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
    2630       66826 :             xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
    2631       66826 :             xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
    2632             : #endif
    2633       66826 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    2634       66826 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2635             :         }
    2636       16403 :         for (; n < nWordCount; n++)
    2637             :         {
    2638       11341 :             pDstData[n] =
    2639       11341 :                 pSrcData[n] >= 255 ? 255 : static_cast<uint8_t>(pSrcData[n]);
    2640        5062 :         }
    2641             :     }
    2642             :     else
    2643             :     {
    2644         975 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2645             :                               nDstPixelStride, nWordCount);
    2646             :     }
    2647        6037 : }
    2648             : 
    2649             : template <>
    2650          46 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2651             :                                  int nSrcPixelStride,
    2652             :                                  int16_t *const CPL_RESTRICT pDstData,
    2653             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2654             : {
    2655          46 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2656             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2657             :     {
    2658          40 :         decltype(nWordCount) n = 0;
    2659             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2660             :         const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
    2661             : #else
    2662             :         // In SSE2, min_epu16 does not exist, so shift from
    2663             :         // UInt16 to SInt16 to be able to use min_epi16
    2664          40 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2665          40 :         const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
    2666             : #endif
    2667         169 :         for (; n < nWordCount - 15; n += 16)
    2668             :         {
    2669         258 :             __m128i xmm0 = _mm_loadu_si128(
    2670         129 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2671         129 :             __m128i xmm1 = _mm_loadu_si128(
    2672         129 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2673             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2674             :             xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
    2675             :             xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
    2676             : #else
    2677         129 :             xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
    2678         129 :             xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
    2679         129 :             xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
    2680         129 :             xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
    2681         129 :             xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
    2682         129 :             xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
    2683             : #endif
    2684         129 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2685         129 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
    2686             :                              xmm1);
    2687             :         }
    2688         191 :         for (; n < nWordCount; n++)
    2689             :         {
    2690         282 :             pDstData[n] = pSrcData[n] >= 32767
    2691             :                               ? 32767
    2692         131 :                               : static_cast<int16_t>(pSrcData[n]);
    2693          40 :         }
    2694             :     }
    2695             :     else
    2696             :     {
    2697           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2698             :                               nDstPixelStride, nWordCount);
    2699             :     }
    2700          46 : }
    2701             : 
    2702             : template <>
    2703         135 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2704             :                                  int nSrcPixelStride,
    2705             :                                  uint16_t *const CPL_RESTRICT pDstData,
    2706             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2707             : {
    2708         135 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2709             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2710             :     {
    2711          92 :         decltype(nWordCount) n = 0;
    2712          92 :         const __m128i xmm_zero = _mm_setzero_si128();
    2713         277 :         for (; n < nWordCount - 15; n += 16)
    2714             :         {
    2715         370 :             __m128i xmm0 = _mm_loadu_si128(
    2716         185 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2717         185 :             __m128i xmm1 = _mm_loadu_si128(
    2718         185 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2719         185 :             xmm0 = _mm_max_epi16(xmm0, xmm_zero);
    2720         185 :             xmm1 = _mm_max_epi16(xmm1, xmm_zero);
    2721         185 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2722         185 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
    2723             :                              xmm1);
    2724             :         }
    2725         468 :         for (; n < nWordCount; n++)
    2726             :         {
    2727         376 :             pDstData[n] =
    2728         376 :                 pSrcData[n] < 0 ? 0 : static_cast<uint16_t>(pSrcData[n]);
    2729          92 :         }
    2730             :     }
    2731             :     else
    2732             :     {
    2733          43 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2734             :                               nDstPixelStride, nWordCount);
    2735             :     }
    2736         135 : }
    2737             : 
    2738             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2739             : 
    2740             : template <>
    2741             : CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
    2742             :                                  int nSrcPixelStride,
    2743             :                                  int32_t *const CPL_RESTRICT pDstData,
    2744             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2745             : {
    2746             :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2747             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2748             :     {
    2749             :         decltype(nWordCount) n = 0;
    2750             :         const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
    2751             :         for (; n < nWordCount - 8; n += 7)
    2752             :         {
    2753             :             __m128i xmm0 = _mm_loadu_si128(
    2754             :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2755             :             __m128i xmm1 = _mm_loadu_si128(
    2756             :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    2757             :             xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
    2758             :             xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
    2759             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2760             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
    2761             :                              xmm1);
    2762             :         }
    2763             :         for (; n < nWordCount; n++)
    2764             :         {
    2765             :             pDstData[n] = pSrcData[n] >= INT_MAX
    2766             :                               ? INT_MAX
    2767             :                               : static_cast<int32_t>(pSrcData[n]);
    2768             :         }
    2769             :     }
    2770             :     else
    2771             :     {
    2772             :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2773             :                               nDstPixelStride, nWordCount);
    2774             :     }
    2775             : }
    2776             : 
    2777             : template <>
    2778             : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    2779             :                                  int nSrcPixelStride,
    2780             :                                  uint32_t *const CPL_RESTRICT pDstData,
    2781             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2782             : {
    2783             :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2784             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2785             :     {
    2786             :         decltype(nWordCount) n = 0;
    2787             :         const __m128i xmm_zero = _mm_setzero_si128();
    2788             :         for (; n < nWordCount - 7; n += 8)
    2789             :         {
    2790             :             __m128i xmm0 = _mm_loadu_si128(
    2791             :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2792             :             __m128i xmm1 = _mm_loadu_si128(
    2793             :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    2794             :             xmm0 = _mm_max_epi32(xmm0, xmm_zero);
    2795             :             xmm1 = _mm_max_epi32(xmm1, xmm_zero);
    2796             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2797             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
    2798             :                              xmm1);
    2799             :         }
    2800             :         for (; n < nWordCount; n++)
    2801             :         {
    2802             :             pDstData[n] =
    2803             :                 pSrcData[n] < 0 ? 0 : static_cast<uint32_t>(pSrcData[n]);
    2804             :         }
    2805             :     }
    2806             :     else
    2807             :     {
    2808             :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2809             :                               nDstPixelStride, nWordCount);
    2810             :     }
    2811             : }
    2812             : 
    2813             : #endif  // defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2814             : 
    2815             : template <>
    2816         343 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2817             :                                  int nSrcPixelStride,
    2818             :                                  float *const CPL_RESTRICT pDstData,
    2819             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2820             : {
    2821         343 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2822             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2823             :     {
    2824         337 :         decltype(nWordCount) n = 0;
    2825         337 :         const __m128i xmm_zero = _mm_setzero_si128();
    2826         337 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2827             :             reinterpret_cast<GByte *>(pDstData);
    2828        1508 :         for (; n < nWordCount - 7; n += 8)
    2829             :         {
    2830        1171 :             __m128i xmm = _mm_loadu_si128(
    2831        1171 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2832        1171 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2833        1171 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2834        1171 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2835        1171 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2836        1171 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2837             :                           xmm0_f);
    2838             :             _mm_storeu_ps(
    2839        1171 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2840             :         }
    2841        1115 :         for (; n < nWordCount; n++)
    2842             :         {
    2843         778 :             pDstData[n] = pSrcData[n];
    2844         337 :         }
    2845             :     }
    2846             :     else
    2847             :     {
    2848           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2849             :                               nDstPixelStride, nWordCount);
    2850             :     }
    2851         343 : }
    2852             : 
    2853             : template <>
    2854     1073480 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2855             :                                  int nSrcPixelStride,
    2856             :                                  float *const CPL_RESTRICT pDstData,
    2857             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2858             : {
    2859     1073480 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2860             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2861             :     {
    2862       83580 :         decltype(nWordCount) n = 0;
    2863       83580 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2864             :             reinterpret_cast<GByte *>(pDstData);
    2865      565267 :         for (; n < nWordCount - 7; n += 8)
    2866             :         {
    2867      481687 :             __m128i xmm = _mm_loadu_si128(
    2868      481687 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2869      481687 :             const auto sign = _mm_srai_epi16(xmm, 15);
    2870      481687 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
    2871      481687 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
    2872      481687 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2873      481687 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2874      481687 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2875             :                           xmm0_f);
    2876             :             _mm_storeu_ps(
    2877      481687 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2878             :         }
    2879      244181 :         for (; n < nWordCount; n++)
    2880             :         {
    2881      160601 :             pDstData[n] = pSrcData[n];
    2882       83580 :         }
    2883             :     }
    2884             :     else
    2885             :     {
    2886      989901 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2887             :                               nDstPixelStride, nWordCount);
    2888             :     }
    2889     1073480 : }
    2890             : 
    2891             : template <>
    2892         405 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2893             :                                  int nSrcPixelStride,
    2894             :                                  double *const CPL_RESTRICT pDstData,
    2895             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2896             : {
    2897         405 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2898             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2899             :     {
    2900         293 :         decltype(nWordCount) n = 0;
    2901         293 :         const __m128i xmm_zero = _mm_setzero_si128();
    2902         293 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2903             :             reinterpret_cast<GByte *>(pDstData);
    2904         809 :         for (; n < nWordCount - 7; n += 8)
    2905             :         {
    2906         516 :             __m128i xmm = _mm_loadu_si128(
    2907         516 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2908         516 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2909         516 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2910             : 
    2911         516 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2912         516 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2913         516 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2914         516 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2915         516 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2916         516 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2917             : 
    2918         516 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2919             :                           xmm0_low_d);
    2920             :             _mm_storeu_pd(
    2921         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2922             :                 xmm0_high_d);
    2923             :             _mm_storeu_pd(
    2924         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2925             :                 xmm1_low_d);
    2926             :             _mm_storeu_pd(
    2927         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2928             :                 xmm1_high_d);
    2929             :         }
    2930        1034 :         for (; n < nWordCount; n++)
    2931             :         {
    2932         741 :             pDstData[n] = pSrcData[n];
    2933         293 :         }
    2934             :     }
    2935             :     else
    2936             :     {
    2937         112 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2938             :                               nDstPixelStride, nWordCount);
    2939             :     }
    2940         405 : }
    2941             : 
    2942             : template <>
    2943     2760350 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2944             :                                  int nSrcPixelStride,
    2945             :                                  double *const CPL_RESTRICT pDstData,
    2946             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2947             : {
    2948     2760350 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2949             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2950             :     {
    2951       34660 :         decltype(nWordCount) n = 0;
    2952       34660 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2953             :             reinterpret_cast<GByte *>(pDstData);
    2954      401770 :         for (; n < nWordCount - 7; n += 8)
    2955             :         {
    2956      367110 :             __m128i xmm = _mm_loadu_si128(
    2957      367110 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2958      367110 :             const auto sign = _mm_srai_epi16(xmm, 15);
    2959      367110 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
    2960      367110 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
    2961             : 
    2962      367110 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2963      367110 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2964      367110 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2965      367110 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2966      367110 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2967      367110 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2968             : 
    2969      367110 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2970             :                           xmm0_low_d);
    2971             :             _mm_storeu_pd(
    2972      367110 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2973             :                 xmm0_high_d);
    2974             :             _mm_storeu_pd(
    2975      367110 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2976             :                 xmm1_low_d);
    2977             :             _mm_storeu_pd(
    2978      367110 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2979             :                 xmm1_high_d);
    2980             :         }
    2981      253693 :         for (; n < nWordCount; n++)
    2982             :         {
    2983      219033 :             pDstData[n] = pSrcData[n];
    2984       34660 :         }
    2985             :     }
    2986             :     else
    2987             :     {
    2988     2725690 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2989             :                               nDstPixelStride, nWordCount);
    2990             :     }
    2991     2760350 : }
    2992             : 
    2993             : #endif  // HAVE_SSE2
    2994             : 
    2995             : template <>
                       // Explicit specialization for double -> GByte copies: forwards every
                       // argument unchanged to GDALCopyWordsT_8atatime().
    2996     4420700 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    2997             :                                  int nSrcPixelStride,
    2998             :                                  GByte *const CPL_RESTRICT pDstData,
    2999             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3000             : {
    3001     4420700 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3002             :                             nDstPixelStride, nWordCount);
    3003     4420700 : }
    3004             : 
    3005             : template <>
                       // Explicit specialization for double -> GUInt16 copies: forwards every
                       // argument unchanged to GDALCopyWordsT_8atatime().
    3006       38235 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3007             :                                  int nSrcPixelStride,
    3008             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    3009             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3010             : {
    3011       38235 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3012             :                             nDstPixelStride, nWordCount);
    3013       38235 : }
    3014             : 
    3015             : template <>
                       // Explicit specialization for float -> double copies: forwards every
                       // argument unchanged to GDALCopyWordsT_8atatime().
    3016       54830 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3017             :                                  int nSrcPixelStride,
    3018             :                                  double *const CPL_RESTRICT pDstData,
    3019             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3020             : {
    3021       54830 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3022             :                             nDstPixelStride, nWordCount);
    3023       54830 : }
    3024             : 
    3025             : template <>
                       // Explicit specialization for double -> float copies: forwards every
                       // argument unchanged to GDALCopyWordsT_8atatime().
    3026      122131 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3027             :                                  int nSrcPixelStride,
    3028             :                                  float *const CPL_RESTRICT pDstData,
    3029             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3030             : {
    3031      122131 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3032             :                             nDstPixelStride, nWordCount);
    3033      122131 : }
    3034             : 
    3035             : template <>
                       // Explicit specialization for GFloat16 -> float copies: forwards every
                       // argument unchanged to GDALCopyWordsT_8atatime().
    3036         396 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
    3037             :                                  int nSrcPixelStride,
    3038             :                                  float *const CPL_RESTRICT pDstData,
    3039             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3040             : {
    3041         396 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3042             :                             nDstPixelStride, nWordCount);
    3043         396 : }
    3044             : 
    3045             : template <>
                       // Explicit specialization for GFloat16 -> double copies: forwards every
                       // argument unchanged to GDALCopyWordsT_8atatime().
    3046         544 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
    3047             :                                  int nSrcPixelStride,
    3048             :                                  double *const CPL_RESTRICT pDstData,
    3049             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3050             : {
    3051         544 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3052             :                             nDstPixelStride, nWordCount);
    3053         544 : }
    3054             : 
    3055             : template <>
                       // Explicit specialization for float -> GByte copies: forwards every
                       // argument unchanged to GDALCopyWordsT_8atatime().
    3056      318163 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3057             :                                  int nSrcPixelStride,
    3058             :                                  GByte *const CPL_RESTRICT pDstData,
    3059             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3060             : {
    3061      318163 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3062             :                             nDstPixelStride, nWordCount);
    3063      318163 : }
    3064             : 
    3065             : template <>
                       // Explicit specialization for float -> GInt8 copies: forwards every
                       // argument unchanged to GDALCopyWordsT_8atatime().
    3066          55 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3067             :                                  int nSrcPixelStride,
    3068             :                                  GInt8 *const CPL_RESTRICT pDstData,
    3069             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3070             : {
    3071          55 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3072             :                             nDstPixelStride, nWordCount);
    3073          55 : }
    3074             : 
    3075             : template <>
                       // Explicit specialization for float -> GInt16 copies: forwards every
                       // argument unchanged to GDALCopyWordsT_8atatime().
    3076       15775 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3077             :                                  int nSrcPixelStride,
    3078             :                                  GInt16 *const CPL_RESTRICT pDstData,
    3079             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3080             : {
    3081       15775 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3082             :                             nDstPixelStride, nWordCount);
    3083       15775 : }
    3084             : 
    3085             : template <>
                       // Explicit specialization for float -> GUInt16 copies: forwards every
                       // argument unchanged to GDALCopyWordsT_8atatime().
    3086       61713 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3087             :                                  int nSrcPixelStride,
    3088             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    3089             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3090             : {
    3091       61713 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3092             :                             nDstPixelStride, nWordCount);
    3093       61713 : }
    3094             : 
    3095             : template <>
                       // Explicit specialization for float -> GInt32 copies: forwards every
                       // argument unchanged to GDALCopyWordsT_8atatime().
    3096       43884 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3097             :                                  int nSrcPixelStride,
    3098             :                                  GInt32 *const CPL_RESTRICT pDstData,
    3099             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3100             : {
    3101       43884 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3102             :                             nDstPixelStride, nWordCount);
    3103       43884 : }
    3104             : 
    3105             : template <>
                       // Explicit specialization for float -> GFloat16 copies: forwards every
                       // argument unchanged to GDALCopyWordsT_8atatime().
    3106          72 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3107             :                                  int nSrcPixelStride,
    3108             :                                  GFloat16 *const CPL_RESTRICT pDstData,
    3109             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3110             : {
    3111          72 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3112             :                             nDstPixelStride, nWordCount);
    3113          72 : }
    3114             : 
    3115             : template <>
                       // Explicit specialization for double -> GFloat16 copies: forwards every
                       // argument unchanged to GDALCopyWordsT_8atatime().
    3116          61 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3117             :                                  int nSrcPixelStride,
    3118             :                                  GFloat16 *const CPL_RESTRICT pDstData,
    3119             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3120             : {
    3121          61 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3122             :                             nDstPixelStride, nWordCount);
    3123          61 : }
    3124             : 
    3125             : /************************************************************************/
    3126             : /*                   GDALCopyWordsComplexT()                            */
    3127             : /************************************************************************/
    3128             : /**
    3129             :  * Template function, used to copy data from pSrcData into buffer
    3130             :  * pDstData, with stride nSrcPixelStride in the source data and
    3131             :  * stride nDstPixelStride in the destination data. Deals with the
    3132             :  * complex case, where input is complex and output is complex.
                     :  * Each pixel is handled as two consecutive components, and each
                     :  * component is converted individually with GDALCopyWord().
    3133             :  *
    3134             :  * @param pSrcData the source data buffer
    3135             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    3136             :  *                      of interest, in bytes.
    3137             :  * @param pDstData the destination buffer.
    3138             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    3139             :  *                      interest, in bytes.
    3140             :  * @param nWordCount the total number of pixel words to copy
    3141             :  *
    3142             :  */
    3143             : template <class Tin, class Tout>
    3144       98631 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
    3145             :                                   int nSrcPixelStride,
    3146             :                                   Tout *const CPL_RESTRICT pDstData,
    3147             :                                   int nDstPixelStride, GPtrDiff_t nWordCount)
    3148             : {
    3149       98631 :     decltype(nWordCount) nDstOffset = 0;
                           // Byte-typed views of both buffers so that the strides can be
                           // applied as plain byte offsets.
    3150       98631 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    3151       98631 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    3152             : 
                           // The source offset is recomputed from n on every iteration, whereas
                           // the destination offset is accumulated in nDstOffset.
    3153     5630497 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3154             :     {
    3155     5531861 :         const Tin *const pPixelIn =
    3156     5531861 :             reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    3157     5531861 :         Tout *const pPixelOut =
    3158     5531861 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    3159             : 
                               // Convert the first and the second component of the pixel.
    3160     5531861 :         GDALCopyWord(pPixelIn[0], pPixelOut[0]);
    3161     5531861 :         GDALCopyWord(pPixelIn[1], pPixelOut[1]);
    3162             : 
    3163     5531861 :         nDstOffset += nDstPixelStride;
    3164             :     }
    3165       98631 : }
    3166             : 
    3167             : /************************************************************************/
    3168             : /*                   GDALCopyWordsComplexOutT()                         */
    3169             : /************************************************************************/
    3170             : /**
    3171             :  * Template function, used to copy data from pSrcData into buffer
    3172             :  * pDstData, with stride nSrcPixelStride in the source data and
    3173             :  * stride nDstPixelStride in the destination data. Deals with the
    3174             :  * case where the value is real coming in, but complex going out.
                     :  * The source value is converted into the first component of each
                     :  * output pixel and the second component is set to zero.
    3175             :  *
    3176             :  * @param pSrcData the source data buffer
    3177             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    3178             :  *                      of interest, in bytes.
    3179             :  * @param pDstData the destination buffer.
    3180             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    3181             :  *                      interest, in bytes.
    3182             :  * @param nWordCount the total number of pixel words to copy
    3183             :  *
    3184             :  */
    3185             : template <class Tin, class Tout>
    3186        4394 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
    3187             :                                      int nSrcPixelStride,
    3188             :                                      Tout *const CPL_RESTRICT pDstData,
    3189             :                                      int nDstPixelStride, GPtrDiff_t nWordCount)
    3190             : {
    3191        4394 :     decltype(nWordCount) nDstOffset = 0;
    3192             : 
                           // Value stored in the second component of every output pixel.
    3193        4394 :     const Tout tOutZero = static_cast<Tout>(0);
    3194             : 
                           // Byte-typed views of both buffers so that the strides can be
                           // applied as plain byte offsets.
    3195        4394 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    3196        4394 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    3197             : 
    3198     1188704 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3199             :     {
    3200     1184310 :         const Tin tValue =
    3201     1184310 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    3202     1184310 :         Tout *const pPixelOut =
    3203     1184310 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
                               // First component: converted source value.
    3204     1184310 :         GDALCopyWord(tValue, *pPixelOut);
    3205             : 
                               // Second component: always zero.
    3206     1184310 :         pPixelOut[1] = tOutZero;
    3207             : 
    3208     1184310 :         nDstOffset += nDstPixelStride;
    3209             :     }
    3210        4394 : }
    3211             : 
    3212             : /************************************************************************/
    3213             : /*                           GDALCopyWordsFromT()                       */
    3214             : /************************************************************************/
    3215             : /**
    3216             :  * Template driver function. Given the input type T, call the appropriate
    3217             :  * GDALCopyWordsT function template for the desired output type. You should
    3218             :  * never call this function directly (call GDALCopyWords instead).
                     :  *
                     :  * Non-complex destination types are dispatched to GDALCopyWordsT() and do
                     :  * not consult bInComplex. Complex destination types are dispatched to
                     :  * GDALCopyWordsComplexT() when bInComplex is true, and to
                     :  * GDALCopyWordsComplexOutT() otherwise. Both strides are passed through
                     :  * unchanged to those helpers.
    3219             :  *
    3220             :  * @param pSrcData source data buffer
    3221             :  * @param nSrcPixelStride pixel stride in input buffer, in bytes
    3222             :  * @param bInComplex input is complex
    3223             :  * @param pDstData destination data buffer
    3224             :  * @param eDstType destination data type
    3225             :  * @param nDstPixelStride pixel stride in output buffer, in bytes
    3226             :  * @param nWordCount number of pixel words to be copied
    3227             :  */
    3228             : template <class T>
    3229    54346973 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
    3230             :                                int nSrcPixelStride, bool bInComplex,
    3231             :                                void *CPL_RESTRICT pDstData,
    3232             :                                GDALDataType eDstType, int nDstPixelStride,
    3233             :                                GPtrDiff_t nWordCount)
    3234             : {
    3235    54346973 :     switch (eDstType)
    3236             :     {
                               // Non-complex destinations: cast pDstData to the matching C type
                               // and let overload/specialization selection pick GDALCopyWordsT.
    3237     4783844 :         case GDT_UInt8:
    3238     4783844 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3239             :                            static_cast<unsigned char *>(pDstData),
    3240             :                            nDstPixelStride, nWordCount);
    3241     4783844 :             break;
    3242         753 :         case GDT_Int8:
    3243         753 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3244             :                            static_cast<signed char *>(pDstData),
    3245             :                            nDstPixelStride, nWordCount);
    3246         753 :             break;
    3247      140646 :         case GDT_UInt16:
    3248      140646 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3249             :                            static_cast<unsigned short *>(pDstData),
    3250             :                            nDstPixelStride, nWordCount);
    3251      140646 :             break;
    3252     4162591 :         case GDT_Int16:
    3253     4162591 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3254             :                            static_cast<short *>(pDstData), nDstPixelStride,
    3255             :                            nWordCount);
    3256     4162591 :             break;
    3257       22554 :         case GDT_UInt32:
    3258       22554 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3259             :                            static_cast<unsigned int *>(pDstData),
    3260             :                            nDstPixelStride, nWordCount);
    3261       22554 :             break;
    3262    26066731 :         case GDT_Int32:
    3263    26066731 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3264             :                            static_cast<int *>(pDstData), nDstPixelStride,
    3265             :                            nWordCount);
    3266    26066731 :             break;
    3267        1110 :         case GDT_UInt64:
    3268        1110 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3269             :                            static_cast<std::uint64_t *>(pDstData),
    3270             :                            nDstPixelStride, nWordCount);
    3271        1110 :             break;
    3272        5754 :         case GDT_Int64:
    3273        5754 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3274             :                            static_cast<std::int64_t *>(pDstData),
    3275             :                            nDstPixelStride, nWordCount);
    3276        5754 :             break;
    3277         997 :         case GDT_Float16:
    3278         997 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3279             :                            static_cast<GFloat16 *>(pDstData), nDstPixelStride,
    3280             :                            nWordCount);
    3281         997 :             break;
    3282     3836699 :         case GDT_Float32:
    3283     3836699 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3284             :                            static_cast<float *>(pDstData), nDstPixelStride,
    3285             :                            nWordCount);
    3286     3836699 :             break;
    3287    15222308 :         case GDT_Float64:
    3288    15222308 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3289             :                            static_cast<double *>(pDstData), nDstPixelStride,
    3290             :                            nWordCount);
    3291    15222308 :             break;
                               // Complex destinations: pDstData is cast to the component type
                               // and the helper is chosen according to bInComplex.
    3292       94424 :         case GDT_CInt16:
    3293       94424 :             if (bInComplex)
    3294             :             {
    3295       93170 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3296             :                                       static_cast<short *>(pDstData),
    3297             :                                       nDstPixelStride, nWordCount);
    3298             :             }
    3299             :             else  // input is not complex, so we need to promote to a complex
    3300             :                   // buffer
    3301             :             {
    3302        1254 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3303             :                                          static_cast<short *>(pDstData),
    3304             :                                          nDstPixelStride, nWordCount);
    3305             :             }
    3306       94424 :             break;
    3307        1349 :         case GDT_CInt32:
    3308        1349 :             if (bInComplex)
    3309             :             {
    3310         717 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3311             :                                       static_cast<int *>(pDstData),
    3312             :                                       nDstPixelStride, nWordCount);
    3313             :             }
    3314             :             else  // input is not complex, so we need to promote to a complex
    3315             :                   // buffer
    3316             :             {
    3317         632 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3318             :                                          static_cast<int *>(pDstData),
    3319             :                                          nDstPixelStride, nWordCount);
    3320             :             }
    3321        1349 :             break;
    3322         313 :         case GDT_CFloat16:
    3323         313 :             if (bInComplex)
    3324             :             {
    3325          48 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3326             :                                       static_cast<GFloat16 *>(pDstData),
    3327             :                                       nDstPixelStride, nWordCount);
    3328             :             }
    3329             :             else  // input is not complex, so we need to promote to a complex
    3330             :                   // buffer
    3331             :             {
    3332         265 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3333             :                                          static_cast<GFloat16 *>(pDstData),
    3334             :                                          nDstPixelStride, nWordCount);
    3335             :             }
    3336         313 :             break;
    3337        3791 :         case GDT_CFloat32:
    3338        3791 :             if (bInComplex)
    3339             :             {
    3340        2994 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3341             :                                       static_cast<float *>(pDstData),
    3342             :                                       nDstPixelStride, nWordCount);
    3343             :             }
    3344             :             else  // input is not complex, so we need to promote to a complex
    3345             :                   // buffer
    3346             :             {
    3347         797 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3348             :                                          static_cast<float *>(pDstData),
    3349             :                                          nDstPixelStride, nWordCount);
    3350             :             }
    3351        3791 :             break;
    3352        3148 :         case GDT_CFloat64:
    3353        3148 :             if (bInComplex)
    3354             :             {
    3355        1702 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3356             :                                       static_cast<double *>(pDstData),
    3357             :                                       nDstPixelStride, nWordCount);
    3358             :             }
    3359             :             else  // input is not complex, so we need to promote to a complex
    3360             :                   // buffer
    3361             :             {
    3362        1446 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3363             :                                          static_cast<double *>(pDstData),
    3364             :                                          nDstPixelStride, nWordCount);
    3365             :             }
    3366        3148 :             break;
                               // Not usable as destination types: only asserted here, no data
                               // is written to pDstData on this path.
    3367           0 :         case GDT_Unknown:
    3368             :         case GDT_TypeCount:
    3369           0 :             CPLAssert(false);
    3370             :     }
    3371    54346973 : }
    3372             : 
    3373             : }  // end anonymous namespace
    3374             : 
    3375             : /************************************************************************/
    3376             : /*                          GDALReplicateWord()                         */
    3377             : /************************************************************************/
    3378             : 
    3379             : template <class T>
                       // Replicates the value of type T already stored at pDstData into the
                       // nWordCount positions that follow it.
                       //
                       // pDstData         buffer whose first element holds the value to
                       //                  replicate; it is read once and not rewritten.
                       // nDstPixelStride  distance in bytes between consecutive destination
                       //                  elements.
                       // nWordCount       number of elements to write after the first one.
    3380      598683 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
    3381             :                                GPtrDiff_t nWordCount)
    3382             : {
    3383      598683 :     const T valSet = *static_cast<const T *>(pDstData);
    3384      598683 :     if (nDstPixelStride == static_cast<int>(sizeof(T)))
    3385             :     {
                               // Packed destination: advance with a typed pointer, starting at
                               // the second element, and write four elements per iteration.
    3386      568932 :         T *pDstPtr = static_cast<T *>(pDstData) + 1;
    3387    31900103 :         while (nWordCount >= 4)
    3388             :         {
    3389    31331168 :             nWordCount -= 4;
    3390    31331168 :             pDstPtr[0] = valSet;
    3391    31331168 :             pDstPtr[1] = valSet;
    3392    31331168 :             pDstPtr[2] = valSet;
    3393    31331168 :             pDstPtr[3] = valSet;
    3394    31331168 :             pDstPtr += 4;
    3395             :         }
                               // Remaining (fewer than four) elements.
    3396     1470437 :         while (nWordCount > 0)
    3397             :         {
    3398      901505 :             --nWordCount;
    3399      901505 :             *pDstPtr = valSet;
    3400      901505 :             pDstPtr++;
    3401             :         }
    3402             :     }
    3403             :     else
    3404             :     {
                               // Any other stride: step a byte pointer by nDstPixelStride,
                               // starting one stride past pDstData.
    3405       29751 :         GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
    3406     1040338 :         while (nWordCount > 0)
    3407             :         {
    3408     1010587 :             --nWordCount;
    3409     1010587 :             *reinterpret_cast<T *>(pabyDstPtr) = valSet;
    3410     1010587 :             pabyDstPtr += nDstPixelStride;
    3411             :         }
    3412             :     }
    3413      598683 : }
    3414             : 
    3415     1050480 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
    3416             :                               GDALDataType eSrcType,
    3417             :                               void *CPL_RESTRICT pDstData,
    3418             :                               GDALDataType eDstType, int nDstPixelStride,
    3419             :                               GPtrDiff_t nWordCount)
    3420             : {
    3421             :     /* -----------------------------------------------------------------------
    3422             :      */
    3423             :     /* Special case when the source data is always the same value */
    3424             :     /* (for VRTSourcedRasterBand::IRasterIO and
    3425             :      * VRTDerivedRasterBand::IRasterIO*/
    3426             :     /*  for example) */
    3427             :     /* -----------------------------------------------------------------------
    3428             :      */
    3429             :     // Let the general translation case do the necessary conversions
    3430             :     // on the first destination element.
    3431     1050480 :     GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
    3432             : 
    3433             :     // Now copy the first element to the nWordCount - 1 following destination
    3434             :     // elements.
    3435     1050480 :     nWordCount--;
    3436     1050480 :     GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
    3437             : 
    3438     1050480 :     switch (eDstType)
    3439             :     {
    3440      451704 :         case GDT_UInt8:
    3441             :         case GDT_Int8:
    3442             :         {
    3443      451704 :             if (nDstPixelStride == 1)
    3444             :             {
    3445      380124 :                 if (nWordCount > 0)
    3446      380124 :                     memset(pabyDstWord,
    3447      380124 :                            *reinterpret_cast<const GByte *>(pDstData),
    3448             :                            nWordCount);
    3449             :             }
    3450             :             else
    3451             :             {
    3452       71580 :                 GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
    3453    54467500 :                 while (nWordCount > 0)
    3454             :                 {
    3455    54395900 :                     --nWordCount;
    3456    54395900 :                     *pabyDstWord = valSet;
    3457    54395900 :                     pabyDstWord += nDstPixelStride;
    3458             :                 }
    3459             :             }
    3460      451704 :             break;
    3461             :         }
    3462             : 
    3463             : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type)                               \
    3464             :     case enum_type:                                                            \
    3465             :     {                                                                          \
    3466             :         GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount);     \
    3467             :         break;                                                                 \
    3468             :     }
    3469             : 
    3470       34507 :             CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
    3471      202447 :             CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
    3472          56 :             CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
    3473      300555 :             CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
    3474          23 :             CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
    3475        1066 :             CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
    3476           0 :             CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
    3477       52668 :             CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
    3478        7361 :             CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
    3479             : 
    3480             : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type)                              \
    3481             :     case enum_type:                                                            \
    3482             :     {                                                                          \
    3483             :         c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0];        \
    3484             :         c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1];        \
    3485             :         while (nWordCount > 0)                                                 \
    3486             :         {                                                                      \
    3487             :             --nWordCount;                                                      \
    3488             :             reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1;              \
    3489             :             reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2;              \
    3490             :             pabyDstWord += nDstPixelStride;                                    \
    3491             :         }                                                                      \
    3492             :         break;                                                                 \
    3493             :     }
    3494             : 
    3495         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
    3496         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
    3497           6 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
    3498         790 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
    3499         790 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
    3500             : 
    3501           0 :         case GDT_Unknown:
    3502             :         case GDT_TypeCount:
    3503           0 :             CPLAssert(false);
    3504             :     }
    3505     1050480 : }
    3506             : 
    3507             : /************************************************************************/
    3508             : /*                        GDALUnrolledCopy()                            */
    3509             : /************************************************************************/
    3510             : 
                       // Strided element copy: for each i in [0, nIters), the element at
                       // pSrc[i * srcStride] is written to pDest[i * dstStride]. Both strides
                       // are compile-time template parameters expressed in elements of T.
                       //
                       // Two build variants exist:
                       //  - when __GNUC__ and __AVX2__ are both defined, only the final simple
                       //    loop is compiled; it is annotated with "#pragma GCC unroll 4" and
                       //    the function carries the optimize("tree-vectorize") attribute;
                       //  - otherwise, the copy is manually unrolled by blocks of 16 elements
                       //    and the final loop only handles the nIters % 16 leftover elements.
    3511             : template <class T, int srcStride, int dstStride>
    3512             : #if defined(__GNUC__) && defined(__AVX2__)
    3513             : __attribute__((optimize("tree-vectorize")))
    3514             : #endif
    3515             : static inline void
    3516     3033265 : GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest, const T *CPL_RESTRICT pSrc,
    3517             :                         GPtrDiff_t nIters)
    3518             : {
    3519             : #if !(defined(__GNUC__) && defined(__AVX2__))
                           // Manually unrolled main loop: nIters / 16 blocks of 16 elements.
    3520     3033265 :     if (nIters >= 16)
    3521             :     {
    3522   133236907 :         for (GPtrDiff_t i = nIters / 16; i != 0; i--)
    3523             :         {
    3524   130324255 :             pDest[0 * dstStride] = pSrc[0 * srcStride];
    3525   130324255 :             pDest[1 * dstStride] = pSrc[1 * srcStride];
    3526   130324255 :             pDest[2 * dstStride] = pSrc[2 * srcStride];
    3527   130324255 :             pDest[3 * dstStride] = pSrc[3 * srcStride];
    3528   130324255 :             pDest[4 * dstStride] = pSrc[4 * srcStride];
    3529   130324255 :             pDest[5 * dstStride] = pSrc[5 * srcStride];
    3530   130324255 :             pDest[6 * dstStride] = pSrc[6 * srcStride];
    3531   130324255 :             pDest[7 * dstStride] = pSrc[7 * srcStride];
    3532   130324255 :             pDest[8 * dstStride] = pSrc[8 * srcStride];
    3533   130324255 :             pDest[9 * dstStride] = pSrc[9 * srcStride];
    3534   130324255 :             pDest[10 * dstStride] = pSrc[10 * srcStride];
    3535   130324255 :             pDest[11 * dstStride] = pSrc[11 * srcStride];
    3536   130324255 :             pDest[12 * dstStride] = pSrc[12 * srcStride];
    3537   130324255 :             pDest[13 * dstStride] = pSrc[13 * srcStride];
    3538   130324255 :             pDest[14 * dstStride] = pSrc[14 * srcStride];
    3539   130324255 :             pDest[15 * dstStride] = pSrc[15 * srcStride];
    3540   130324255 :             pDest += 16 * dstStride;
    3541   130324255 :             pSrc += 16 * srcStride;
    3542             :         }
                               // Only the leftover elements remain for the loop below.
    3543     2912698 :         nIters = nIters % 16;
    3544             :     }
    3545             : #else
    3546             : #pragma GCC unroll 4
    3547             : #endif
    3548     5181291 :     for (GPtrDiff_t i = 0; i < nIters; i++)
    3549             :     {
    3550     2148037 :         pDest[i * dstStride] = *pSrc;
    3551     2148037 :         pSrc += srcStride;
    3552             :     }
    3553     3033265 : }
    3554             : 
    3555             : template <class T, int srcStride, int dstStride>
    3556     3033265 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
    3557             :                                     const T *CPL_RESTRICT pSrc,
    3558             :                                     GPtrDiff_t nIters)
    3559             : {
                           // Primary template: forwards unchanged to the generic strided copy.
                           // Explicit specializations for some <GByte, srcStride, 1> combinations
                           // are provided further down, depending on the SIMD build flags.
    3560     3033265 :     GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
    3561     3033265 : }
    3562             : 
    3563             : #if defined(__AVX2__) && defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                \
    3564             :     (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS))
    3565             : 
    3566             : template <>
    3567             : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3568             :                                    const GByte *CPL_RESTRICT pSrc,
    3569             :                                    GPtrDiff_t nIters)
    3570             : {
                           // Specialization used when the enclosing #if selects the AVX2 +
                           // compile-time SSSE3 configuration: copies every third source byte
                           // into a packed destination. More than 16 iterations are delegated to
                           // GDALUnrolledCopy_GByte_3_1_SSSE3() (defined outside this excerpt);
                           // shorter runs use the scalar loop.
    3571             :     if (nIters > 16)
    3572             :     {
    3573             :         // The SSSE3 variant is slightly faster than what the gcc autovectorizer
    3574             :         // generates
    3575             :         GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    3576             :     }
    3577             :     else
    3578             :     {
    3579             :         for (GPtrDiff_t i = 0; i < nIters; i++)
    3580             :         {
    3581             :             pDest[i] = *pSrc;
    3582             :             pSrc += 3;
    3583             :         }
    3584             :     }
    3585             : }
    3586             : 
    3587             : #elif defined(HAVE_SSE2) && !(defined(__GNUC__) && defined(__AVX2__))
    3588             : 
    3589             : template <>
    3590      354460 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
    3591             :                                    const GByte *CPL_RESTRICT pSrc,
    3592             :                                    GPtrDiff_t nIters)
    3593             : {
                           // SSE2 specialization: copies every second source byte (offsets 0, 2,
                           // 4, ...) into a packed destination. i indexes pDest and is shared by
                           // the vector loop and the scalar tail loop, while pSrc is advanced in
                           // place.
    3594      354460 :     decltype(nIters) i = 0;
    3595      354460 :     if (nIters > 16)
    3596             :     {
                               // 0x00ff in every 16-bit lane: keeps the low byte of each lane.
    3597      196203 :         const __m128i xmm_mask = _mm_set1_epi16(0xff);
    3598             :         // If we were sure that there would always be 1 trailing byte, we could
    3599             :         // check against nIters - 15
                               // Each iteration loads 32 source bytes (two unaligned 16-byte loads)
                               // and stores 16 destination bytes. With the "nIters - 16" bound,
                               // between 1 and 16 elements are always left to the scalar loop.
    3600     3012690 :         for (; i < nIters - 16; i += 16)
    3601             :         {
    3602             :             __m128i xmm0 =
    3603     2816480 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3604             :             __m128i xmm1 =
    3605     5632970 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3606             :             // Set higher 8bit of each int16 packed word to 0
    3607     2816480 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3608     2816480 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3609             :             // Pack int16 to uint8 and merge back both vector
    3610     2816480 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    3611             : 
    3612             :             // Store result
    3613     2816480 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3614             : 
    3615     2816480 :             pSrc += 2 * 16;
    3616             :         }
    3617             :     }
                           // Scalar tail (and the whole copy when nIters <= 16).
    3618     4646080 :     for (; i < nIters; i++)
    3619             :     {
    3620     4291620 :         pDest[i] = *pSrc;
    3621     4291620 :         pSrc += 2;
    3622             :     }
    3623      354460 : }
    3624             : 
    3625           1 : static void GDALUnrolledCopy_GByte_3_1_SSE2(GByte *CPL_RESTRICT pDest,
    3626             :                                             const GByte *CPL_RESTRICT pSrc,
    3627             :                                             GPtrDiff_t nIters)
    3628             : {
                           // SSE2-only copy of every third source byte (offsets 0, 3, 6, ...) into
                           // a packed destination. Each vector iteration loads 48 source bytes
                           // (three unaligned 16-byte loads) and assembles 16 destination bytes
                           // with whole-register byte shifts, single-byte masks and ORs:
                           //  - destination bytes 0..5 come from xmm0 (source bytes 0..15),
                           //  - destination bytes 6..10 come from xmm1 (source bytes 16..31),
                           //  - destination bytes 11..15 come from xmm2 (source bytes 32..47).
                           // NOTE(review): there is no "nIters > 16" guard in this function; the
                           // loop condition relies on "nIters - 16" being evaluated in a signed
                           // type (GPtrDiff_t is declared outside this excerpt) - confirm.
    3629           1 :     decltype(nIters) i = 0;
                           // Mask with only the lowest byte set; shifted copies of it select one
                           // destination byte position at a time.
    3630           1 :     const __m128i xmm_mask_ori = _mm_set_epi32(0, 0, 0, 255);
    3631             :     // If we were sure that there would always be 2 trailing bytes, we could
    3632             :     // check against nIters - 15
    3633           2 :     for (; i < nIters - 16; i += 16)
    3634             :     {
    3635             :         __m128i xmm0 =
    3636           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3637             :         __m128i xmm1 =
    3638           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3639             :         __m128i xmm2 =
    3640           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    3641             : 
                               // Initial masks select destination bytes 0, 6 and 11 respectively.
    3642           1 :         auto xmm_mask0 = xmm_mask_ori;
    3643           1 :         auto xmm_mask1 = _mm_slli_si128(xmm_mask_ori, 6);
    3644           1 :         auto xmm_mask2 = _mm_slli_si128(xmm_mask_ori, 11);
    3645             : 
    3646           1 :         auto xmm = _mm_and_si128(xmm0, xmm_mask0);
    3647           1 :         auto xmm_res1 = _mm_and_si128(_mm_slli_si128(xmm1, 4), xmm_mask1);
    3648             : 
                               // From here on, each step moves the masks one byte up and shifts
                               // the inputs so that the next wanted source byte lines up with it.
    3649           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3650           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3651           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3652           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3653           2 :         xmm_res1 = _mm_or_si128(
    3654             :             xmm_res1, _mm_and_si128(_mm_slli_si128(xmm1, 2), xmm_mask1));
    3655             : 
    3656           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3657           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3658           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3659           2 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3660           1 :         xmm_res1 = _mm_or_si128(xmm_res1, _mm_and_si128(xmm1, xmm_mask1));
    3661             : 
    3662           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3663           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3664           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3665           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3666           2 :         xmm_res1 = _mm_or_si128(
    3667             :             xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 2), xmm_mask1));
    3668             : 
    3669           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3670           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3671           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3672           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3673           3 :         xmm_res1 = _mm_or_si128(
    3674             :             xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 4), xmm_mask1));
                               // Contribution of xmm1 (destination bytes 6..10) is complete.
    3675           1 :         xmm = _mm_or_si128(xmm, xmm_res1);
    3676             : 
    3677           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3678           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3679           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3680             : 
                               // Destination bytes 11..15, taken from xmm2.
    3681           2 :         xmm = _mm_or_si128(xmm,
    3682             :                            _mm_and_si128(_mm_slli_si128(xmm2, 10), xmm_mask2));
    3683             : 
    3684           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3685           2 :         xmm = _mm_or_si128(xmm,
    3686             :                            _mm_and_si128(_mm_slli_si128(xmm2, 8), xmm_mask2));
    3687             : 
    3688           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3689           2 :         xmm = _mm_or_si128(xmm,
    3690             :                            _mm_and_si128(_mm_slli_si128(xmm2, 6), xmm_mask2));
    3691             : 
    3692           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3693           2 :         xmm = _mm_or_si128(xmm,
    3694             :                            _mm_and_si128(_mm_slli_si128(xmm2, 4), xmm_mask2));
    3695             : 
    3696           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3697           2 :         xmm = _mm_or_si128(xmm,
    3698             :                            _mm_and_si128(_mm_slli_si128(xmm2, 2), xmm_mask2));
    3699             : 
    3700           1 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm);
    3701             : 
    3702           1 :         pSrc += 3 * 16;
    3703             :     }
                           // Scalar tail for the elements not handled by the vector loop.
    3704           2 :     for (; i < nIters; i++)
    3705             :     {
    3706           1 :         pDest[i] = *pSrc;
    3707           1 :         pSrc += 3;
    3708             :     }
    3709           1 : }
    3710             : 
    3711             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    3712             : 
    3713             : template <>
    3714      192064 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3715             :                                    const GByte *CPL_RESTRICT pSrc,
    3716             :                                    GPtrDiff_t nIters)
    3717             : {
                           // Specialization compiled when HAVE_SSSE3_AT_COMPILE_TIME is defined
                           // (SSE2 build branch): copies every third source byte into a packed
                           // destination. For more than 16 iterations, the SSSE3 implementation is
                           // used if CPLHaveRuntimeSSSE3() reports it as available, otherwise the
                           // SSE2 one. Short runs use the scalar loop.
    3718      192064 :     if (nIters > 16)
    3719             :     {
    3720      185965 :         if (CPLHaveRuntimeSSSE3())
    3721             :         {
    3722      185964 :             GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    3723             :         }
    3724             :         else
    3725             :         {
    3726           1 :             GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
    3727             :         }
    3728             :     }
    3729             :     else
    3730             :     {
    3731       20168 :         for (GPtrDiff_t i = 0; i < nIters; i++)
    3732             :         {
    3733       14069 :             pDest[i] = *pSrc;
    3734       14069 :             pSrc += 3;
    3735             :         }
    3736             :     }
    3737      192064 : }
    3738             : 
    3739             : #else
    3740             : 
    3741             : template <>
    3742             : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3743             :                                    const GByte *CPL_RESTRICT pSrc,
    3744             :                                    GPtrDiff_t nIters)
    3745             : {
                           // Variant compiled when HAVE_SSSE3_AT_COMPILE_TIME is not defined: the
                           // SSE2 implementation is always used, for any nIters; it ends with its
                           // own scalar loop for the elements its vector loop does not handle.
    3746             :     GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
    3747             : }
    3748             : #endif
    3749             : 
    3750             : template <>
    3751      106698 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
    3752             :                                    const GByte *CPL_RESTRICT pSrc,
    3753             :                                    GPtrDiff_t nIters)
    3754             : {
                           // SSE2 specialization: copies every fourth source byte (offsets 0, 4,
                           // 8, ...) into a packed destination. i indexes pDest and is shared by
                           // the vector loop and the scalar tail loop, while pSrc is advanced in
                           // place.
    3755      106698 :     decltype(nIters) i = 0;
    3756      106698 :     if (nIters > 16)
    3757             :     {
                               // 0x000000ff in every 32-bit lane: keeps the low byte of each lane.
    3758      101405 :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    3759             :         // If we were sure that there would always be 3 trailing bytes, we could
    3760             :         // check against nIters - 15
                               // Each iteration loads 64 source bytes (four unaligned 16-byte
                               // loads) and stores 16 destination bytes. With the "nIters - 16"
                               // bound, between 1 and 16 elements are left to the scalar loop.
    3761    11580500 :         for (; i < nIters - 16; i += 16)
    3762             :         {
    3763             :             __m128i xmm0 =
    3764    11479100 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3765             :             __m128i xmm1 =
    3766    11479100 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3767             :             __m128i xmm2 =
    3768    11479100 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    3769             :             __m128i xmm3 =
    3770    22958200 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
    3771             :             // Set higher 24bit of each int32 packed word to 0
    3772    11479100 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3773    11479100 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3774    11479100 :             xmm2 = _mm_and_si128(xmm2, xmm_mask);
    3775    11479100 :             xmm3 = _mm_and_si128(xmm3, xmm_mask);
    3776             :             // Pack int32 to int16
                                   // (the values were masked to [0, 255] above, so the saturating
                                   // packs below leave them unchanged)
    3777    11479100 :             xmm0 = _mm_packs_epi32(xmm0, xmm1);
    3778    11479100 :             xmm2 = _mm_packs_epi32(xmm2, xmm3);
    3779             :             // Pack int16 to uint8
    3780    11479100 :             xmm0 = _mm_packus_epi16(xmm0, xmm2);
    3781             : 
    3782             :             // Store result
    3783    11479100 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3784             : 
    3785    11479100 :             pSrc += 4 * 16;
    3786             :         }
    3787             :     }
                           // Scalar tail (and the whole copy when nIters <= 16).
    3788     1143150 :     for (; i < nIters; i++)
    3789             :     {
    3790     1036450 :         pDest[i] = *pSrc;
    3791     1036450 :         pSrc += 4;
    3792             :     }
    3793      106698 : }
    3794             : #endif  // HAVE_SSE2
    3795             : 
    3796             : /************************************************************************/
    3797             : /*                         GDALFastCopy()                               */
    3798             : /************************************************************************/
    3799             : 
                       // Copies nIters elements of type T from pSrc to pDest, without any value
                       // conversion. nDestStride and nSrcStride are the distances between two
                       // consecutive elements expressed in bytes: they are compared against
                       // multiples of sizeof(T) and divided by sizeof(T) before being applied to
                       // the typed pointers.
                       //
                       // Dispatch order:
                       //  - a single element is assigned directly;
                       //  - packed destination and packed source: one memcpy();
                       //  - packed destination with a source stride of 2, 3 or 4 elements, or
                       //    packed source with a destination stride of 2, 3 or 4 elements:
                       //    GDALUnrolledCopy<>;
                       //  - anything else: plain strided loop.
                       //
                       // NOTE(review): a stride that is not a multiple of sizeof(T) would be
                       // truncated by the integer divisions below; presumably callers guarantee
                       // divisibility - confirm.
    3800             : template <class T>
    3801    39777600 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
    3802             :                                 const T *CPL_RESTRICT pSrc, int nSrcStride,
    3803             :                                 GPtrDiff_t nIters)
    3804             : {
    3805    39777600 :     constexpr int sizeofT = static_cast<int>(sizeof(T));
    3806    39777600 :     if (nIters == 1)
    3807             :     {
    3808    22297230 :         *pDest = *pSrc;
    3809             :     }
                           // Packed destination.
    3810    17480314 :     else if (nDestStride == sizeofT)
    3811             :     {
    3812    14373572 :         if (nSrcStride == sizeofT)
    3813             :         {
    3814    13513459 :             memcpy(pDest, pSrc, nIters * sizeof(T));
    3815             :         }
    3816      860053 :         else if (nSrcStride == 2 * sizeofT)
    3817             :         {
    3818      357675 :             GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
    3819             :         }
    3820      502378 :         else if (nSrcStride == 3 * sizeofT)
    3821             :         {
    3822      288642 :             GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
    3823             :         }
    3824      213736 :         else if (nSrcStride == 4 * sizeofT)
    3825             :         {
    3826      110680 :             GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
    3827             :         }
    3828             :         else
    3829             :         {
    3830    17219290 :             while (nIters-- > 0)
    3831             :             {
    3832    17116250 :                 *pDest = *pSrc;
    3833    17116250 :                 pSrc += nSrcStride / sizeofT;
    3834    17116250 :                 pDest++;
    3835             :             }
    3836             :         }
    3837             :     }
                           // Packed source, strided destination.
    3838     3106802 :     else if (nSrcStride == sizeofT)
    3839             :     {
    3840     3093796 :         if (nDestStride == 2 * sizeofT)
    3841             :         {
    3842      150268 :             GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
    3843             :         }
    3844     2943525 :         else if (nDestStride == 3 * sizeofT)
    3845             :         {
    3846     2115801 :             GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
    3847             :         }
    3848      827733 :         else if (nDestStride == 4 * sizeofT)
    3849             :         {
    3850      663421 :             GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
    3851             :         }
    3852             :         else
    3853             :         {
    3854    17169660 :             while (nIters-- > 0)
    3855             :             {
    3856    17005410 :                 *pDest = *pSrc;
    3857    17005410 :                 pSrc++;
    3858    17005410 :                 pDest += nDestStride / sizeofT;
    3859             :             }
    3860             :         }
    3861             :     }
                           // Both source and destination are strided.
    3862             :     else
    3863             :     {
    3864     1220108 :         while (nIters-- > 0)
    3865             :         {
    3866     1207102 :             *pDest = *pSrc;
    3867     1207102 :             pSrc += nSrcStride / sizeofT;
    3868     1207102 :             pDest += nDestStride / sizeofT;
    3869             :         }
    3870             :     }
    3871    39777600 : }
    3872             : 
    3873             : /************************************************************************/
    3874             : /*                         GDALFastCopyByte()                           */
    3875             : /************************************************************************/
    3876             : 
                       // Byte-typed convenience wrapper around GDALFastCopy(). Note the argument
                       // order: this function takes the source first and the destination second,
                       // whereas GDALFastCopy() takes the destination first. The strides and the
                       // word count are passed through unchanged.
    3877      326250 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
    3878             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
    3879             :                              int nDstPixelStride, GPtrDiff_t nWordCount)
    3880             : {
    3881      326250 :     GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
    3882             :                  nWordCount);
    3883      326250 : }
    3884             : 
    3885             : /************************************************************************/
    3886             : /*                           GDALCopyWords()                            */
    3887             : /************************************************************************/
    3888             : 
    3889             : /**
    3890             :  * Copy pixel words from buffer to buffer.
    3891             :  *
                        * This is a thin wrapper that forwards all its arguments unchanged to
                        * GDALCopyWords64(); the only difference between both signatures is that
                        * nWordCount is an int here and a GPtrDiff_t in GDALCopyWords64().
                        *
    3892             :  * @see GDALCopyWords64()
    3893             :  */
    3894    78067600 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
    3895             :                                GDALDataType eSrcType, int nSrcPixelStride,
    3896             :                                void *CPL_RESTRICT pDstData,
    3897             :                                GDALDataType eDstType, int nDstPixelStride,
    3898             :                                int nWordCount)
    3899             : {
    3900    78067600 :     GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
    3901             :                     nDstPixelStride, nWordCount);
    3902    78067600 : }
    3903             : 
    3904             : /************************************************************************/
    3905             : /*                          GDALCopyWords64()                           */
    3906             : /************************************************************************/
    3907             : 
    3908             : /**
    3909             :  * Copy pixel words from buffer to buffer.
    3910             :  *
    3911             :  * This function is used to copy pixel word values from one memory buffer
    3912             :  * to another, with support for conversion between data types, and differing
    3913             :  * step factors. The data type conversion is done using the following
    3914             :  * rules:
    3915             :  * <ul>
    3916             :  * <li>Values assigned to a lower range integer type are clipped. For
    3917             :  * instance assigning GDT_Int16 values to a GDT_UInt8 buffer will cause values
    3918             :  * less than 0 to be set to 0, and values larger than 255 to be set to 255.
    3919             :  * </li>
    3920             :  * <li>
    3921             :  * Assignment from floating point to integer rounds to closest integer.
    3922             :  * +Infinity is mapped to the largest integer. -Infinity is mapped to the
    3923             :  * smallest integer. NaN is mapped to 0.
    3924             :  * </li>
    3925             :  * <li>
    3926             :  * Assignment from non-complex to complex will result in the imaginary part
    3927             :  * being set to zero on output.
    3928             :  * </li>
    3929             :  * <li> Assignment from complex to
    3930             :  * non-complex will result in the complex portion being lost and the real
    3931             :  * component being preserved (<i>not magnitude!</i>).
    3932             :  * </li>
    3933             :  * </ul>
    3934             :  *
    3935             :  * No assumptions are made about the source or destination words occurring
    3936             :  * on word boundaries.  It is assumed that all values are in native machine
    3937             :  * byte order.
    3938             :  *
    3939             :  * @param pSrcData Pointer to source data to be converted.
    3940             :  * @param eSrcType the source data type (see GDALDataType enum)
    3941             :  * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
    3942             :  * in bytes
    3943             :  * @param pDstData Pointer to buffer where destination data should go
    3944             :  * @param eDstType the destination data type (see GDALDataType enum)
    3945             :  * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
    3946             :  * words), in bytes
    3947             :  * @param nWordCount number of words to be copied
    3948             :  *
    3949             :  * @note
    3950             :  * When adding a new data type to GDAL, you must do the following to
    3951             :  * support it properly within the GDALCopyWords function:
    3952             :  * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
    3953             :  *    This should invoke the appropriate GDALCopyWordsFromT wrapper.
    3954             :  * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
    3955             :  *    This should call the appropriate GDALCopyWordsT template.
    3956             :  * 3. If appropriate, overload the appropriate CopyWord template in the
    3957             :  *    above namespace. This will ensure that any conversion issues are
    3958             :  *    handled (cases like the float -> int32 case, where the min/max
    3959             :  *    values are subject to roundoff error).
    3960             :  */
    3961             : 
    3962   108954000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
    3963             :                                  GDALDataType eSrcType, int nSrcPixelStride,
    3964             :                                  void *CPL_RESTRICT pDstData,
    3965             :                                  GDALDataType eDstType, int nDstPixelStride,
    3966             :                                  GPtrDiff_t nWordCount)
    3967             : 
    3968             : {
    3969             :     // On platforms where alignment matters, be careful
    3970   108954000 :     const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
    3971   108954000 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
    3972   108954000 :     if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
    3973             :     {
    3974           2 :         CPLError(CE_Failure, CPLE_NotSupported,
    3975             :                  "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
    3976             :                  "argument");
    3977           2 :         return;
    3978             :     }
    3979   108954000 :     if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
    3980    59163100 :         ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
    3981    59163100 :          (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
    3982    59162700 :          (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
    3983    59162600 :          (nDstPixelStride % nDstDataTypeSize) != 0))
    3984             :     {
    3985         905 :         if (eSrcType == eDstType)
    3986             :         {
    3987       34800 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    3988             :             {
    3989       34000 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    3990             :                        static_cast<const GByte *>(pSrcData) +
    3991       34000 :                            nSrcPixelStride * i,
    3992             :                        nDstDataTypeSize);
    3993             :             }
    3994             :         }
    3995             :         else
    3996             :         {
    3997         210 :             const auto getAlignedPtr = [](GByte *ptr, int align)
    3998             :             {
    3999             :                 return ptr +
    4000         210 :                        ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
    4001         210 :                         align);
    4002             :             };
    4003             : 
    4004             :             // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
    4005             :             // be sure to get correctly aligned pointer.
    4006         105 :             constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
    4007             :             GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
    4008             :             GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
    4009             :             GByte *pabySrcBuffer =
    4010         105 :                 getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
    4011             :             GByte *pabyDstBuffer =
    4012         105 :                 getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
    4013        3360 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    4014             :             {
    4015        3255 :                 memcpy(pabySrcBuffer,
    4016             :                        static_cast<const GByte *>(pSrcData) +
    4017        3255 :                            nSrcPixelStride * i,
    4018             :                        nSrcDataTypeSize);
    4019        3255 :                 GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
    4020             :                                 eDstType, 0, 1);
    4021        3255 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    4022             :                        pabyDstBuffer, nDstDataTypeSize);
    4023             :             }
    4024             :         }
    4025         905 :         return;
    4026             :     }
    4027             : 
    4028             :     // Deal with the case where we're replicating a single word into the
    4029             :     // provided buffer
    4030   108953000 :     if (nSrcPixelStride == 0 && nWordCount > 1)
    4031             :     {
    4032     1050480 :         GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
    4033             :                           nDstPixelStride, nWordCount);
    4034     1050480 :         return;
    4035             :     }
    4036             : 
    4037   107902000 :     if (eSrcType == eDstType)
    4038             :     {
    4039    53817400 :         if (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8)
    4040             :         {
    4041    18000300 :             GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
    4042             :                          static_cast<const GByte *>(pSrcData), nSrcPixelStride,
    4043             :                          nWordCount);
    4044    18000300 :             return;
    4045             :         }
    4046             : 
    4047    35817100 :         if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
    4048    21451000 :             (nDstPixelStride % 2) == 0)
    4049             :         {
    4050    21451000 :             GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
    4051             :                          static_cast<const short *>(pSrcData), nSrcPixelStride,
    4052             :                          nWordCount);
    4053    21451000 :             return;
    4054             :         }
    4055             : 
    4056    14366100 :         if (nWordCount == 1)
    4057             :         {
    4058             : #if defined(CSA_BUILD) || defined(__COVERITY__)
    4059             :             // Avoid false positives...
    4060             :             memcpy(pDstData, pSrcData, nSrcDataTypeSize);
    4061             : #else
    4062    13908600 :             if (nSrcDataTypeSize == 2)
    4063           0 :                 memcpy(pDstData, pSrcData, 2);
    4064    13908600 :             else if (nSrcDataTypeSize == 4)
    4065    13813500 :                 memcpy(pDstData, pSrcData, 4);
    4066       95125 :             else if (nSrcDataTypeSize == 8)
    4067       78520 :                 memcpy(pDstData, pSrcData, 8);
    4068             :             else /* if( eSrcType == GDT_CFloat64 ) */
    4069       16605 :                 memcpy(pDstData, pSrcData, 16);
    4070             : #endif
    4071    13908600 :             return;
    4072             :         }
    4073             : 
    4074             :         // Let memcpy() handle the case where we're copying a packed buffer
    4075             :         // of pixels.
    4076      457421 :         if (nSrcPixelStride == nDstPixelStride)
    4077             :         {
    4078      195607 :             if (nSrcPixelStride == nSrcDataTypeSize)
    4079             :             {
    4080      195539 :                 memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
    4081      195539 :                 return;
    4082             :             }
    4083             :         }
    4084             :     }
    4085             : 
    4086             :     // Handle the more general case -- deals with conversion of data types
    4087             :     // directly.
    4088    54346900 :     switch (eSrcType)
    4089             :     {
    4090    15545100 :         case GDT_UInt8:
    4091    15545100 :             GDALCopyWordsFromT<unsigned char>(
    4092             :                 static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
    4093             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4094    15545100 :             break;
    4095        1291 :         case GDT_Int8:
    4096        1291 :             GDALCopyWordsFromT<signed char>(
    4097             :                 static_cast<const signed char *>(pSrcData), nSrcPixelStride,
    4098             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4099        1291 :             break;
    4100       54285 :         case GDT_UInt16:
    4101       54285 :             GDALCopyWordsFromT<unsigned short>(
    4102             :                 static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
    4103             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4104       54285 :             break;
    4105     4353740 :         case GDT_Int16:
    4106     4353740 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    4107             :                                       nSrcPixelStride, false, pDstData,
    4108             :                                       eDstType, nDstPixelStride, nWordCount);
    4109     4353740 :             break;
    4110        7432 :         case GDT_UInt32:
    4111        7432 :             GDALCopyWordsFromT<unsigned int>(
    4112             :                 static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
    4113             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4114        7432 :             break;
    4115    12255400 :         case GDT_Int32:
    4116    12255400 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    4117             :                                     nSrcPixelStride, false, pDstData, eDstType,
    4118             :                                     nDstPixelStride, nWordCount);
    4119    12255400 :             break;
    4120        1957 :         case GDT_UInt64:
    4121        1957 :             GDALCopyWordsFromT<std::uint64_t>(
    4122             :                 static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
    4123             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4124        1957 :             break;
    4125       11578 :         case GDT_Int64:
    4126       11578 :             GDALCopyWordsFromT<std::int64_t>(
    4127             :                 static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
    4128             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4129       11578 :             break;
    4130        1371 :         case GDT_Float16:
    4131        1371 :             GDALCopyWordsFromT<GFloat16>(
    4132             :                 static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
    4133             :                 pDstData, eDstType, nDstPixelStride, nWordCount);
    4134        1371 :             break;
    4135      657732 :         case GDT_Float32:
    4136      657732 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    4137             :                                       nSrcPixelStride, false, pDstData,
    4138             :                                       eDstType, nDstPixelStride, nWordCount);
    4139      657732 :             break;
    4140    20697400 :         case GDT_Float64:
    4141    20697400 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    4142             :                                        nSrcPixelStride, false, pDstData,
    4143             :                                        eDstType, nDstPixelStride, nWordCount);
    4144    20697400 :             break;
    4145      478485 :         case GDT_CInt16:
    4146      478485 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    4147             :                                       nSrcPixelStride, true, pDstData, eDstType,
    4148             :                                       nDstPixelStride, nWordCount);
    4149      478485 :             break;
    4150         868 :         case GDT_CInt32:
    4151         868 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    4152             :                                     nSrcPixelStride, true, pDstData, eDstType,
    4153             :                                     nDstPixelStride, nWordCount);
    4154         868 :             break;
    4155         508 :         case GDT_CFloat16:
    4156         508 :             GDALCopyWordsFromT<GFloat16>(
    4157             :                 static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
    4158             :                 pDstData, eDstType, nDstPixelStride, nWordCount);
    4159         508 :             break;
    4160        2389 :         case GDT_CFloat32:
    4161        2389 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    4162             :                                       nSrcPixelStride, true, pDstData, eDstType,
    4163             :                                       nDstPixelStride, nWordCount);
    4164        2389 :             break;
    4165      277349 :         case GDT_CFloat64:
    4166      277349 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    4167             :                                        nSrcPixelStride, true, pDstData,
    4168             :                                        eDstType, nDstPixelStride, nWordCount);
    4169      277349 :             break;
    4170           0 :         case GDT_Unknown:
    4171             :         case GDT_TypeCount:
    4172           0 :             CPLAssert(false);
    4173             :     }
    4174             : }
    4175             : 
    4176             : /************************************************************************/
    4177             : /*                            GDALCopyBits()                            */
    4178             : /************************************************************************/
    4179             : 
    4180             : /**
    4181             :  * Bitwise word copying.
    4182             :  *
    4183             :  * A function for moving sets of partial bytes around.  Loosely
    4184             :  * speaking this is a bitwise analog to GDALCopyWords().
    4185             :  *
    4186             :  * It copies nStepCount "words" where each word is nBitCount bits long.
    4187             :  * The nSrcStep and nDstStep are the number of bits from the start of one
    4188             :  * word to the next (same as nBitCount if they are packed).  The nSrcOffset
    4189             :  * and nDstOffset are the offset into the source and destination buffers
    4190             :  * to start at, also measured in bits.
    4191             :  *
    4192             :  * All bit offsets are assumed to start from the high order bit in a byte
    4193             :  * (i.e. most significant bit first).  Currently this function is not very
    4194             :  * optimized, but it may be improved for some common cases in the future
    4195             :  * as needed.
    4196             :  *
    4197             :  * @param pabySrcData the source data buffer.
    4198             :  * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
    4199             :  * first word to copy.
    4200             :  * @param nSrcStep the offset in bits from the start of one source word to the
    4201             :  * start of the next.
    4202             :  * @param pabyDstData the destination data buffer.
    4203             :  * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
    4204             :  * first word to copy over.
    4205             :  * @param nDstStep the offset in bits from the start of one destination word
    4206             :  * to the start of the next.
    4207             :  * @param nBitCount the number of bits in a word to be copied.
    4208             :  * @param nStepCount the number of words to copy.
    4209             :  */
    4210             : 
    4211           0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
    4212             :                   GByte *pabyDstData, int nDstOffset, int nDstStep,
    4213             :                   int nBitCount, int nStepCount)
    4214             : 
    4215             : {
                           // Only the source pointer goes through the validation macro;
                           // pabyDstData is dereferenced below without a check.
                           // NOTE(review): VALIDATE_POINTER0 is defined outside this chunk;
                           // presumably it reports an error and returns on null - confirm.
    4216           0 :     VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
    4217             : 
                           // Outer loop: one iteration per word.  Inner loop: one iteration
                           // per bit of that word.
    4218           0 :     for (int iStep = 0; iStep < nStepCount; iStep++)
    4219             :     {
    4220           0 :         for (int iBit = 0; iBit < nBitCount; iBit++)
    4221             :         {
                                   // offset >> 3 is the byte index; 0x80 >> (offset & 7) is
                                   // the mask for the bit inside that byte, counted from the
                                   // most significant bit.  Only the addressed destination
                                   // bit is set or cleared; its neighbours are preserved.
    4222           0 :             if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
    4223           0 :                 pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
    4224             :             else
    4225           0 :                 pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
    4226             : 
    4227           0 :             nSrcOffset++;
    4228           0 :             nDstOffset++;
    4229             :         }
    4230             : 
                               // The inner loop already advanced both offsets by nBitCount,
                               // so only the remainder of the step is added to reach the
                               // start of the next word.
    4231           0 :         nSrcOffset += (nSrcStep - nBitCount);
    4232           0 :         nDstOffset += (nDstStep - nBitCount);
    4233             :     }
    4234             : }
    4235             : 
    4236             : /************************************************************************/
    4237             : /*                    GDALBandGetBestOverviewLevel()                    */
    4238             : /*                                                                      */
    4239             : /* Returns the best overview level to satisfy the query or -1 if none.  */
    4240             : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when        */
    4241             : /* returning a valid overview level                                     */
    4242             : /************************************************************************/
    4243             : 
                       // Convenience wrapper around GDALBandGetBestOverviewLevel2() that
                       // passes a null psExtraArg: the window arguments are still rewritten
                       // in overview coordinates on success, but no extra-argument structure
                       // is consulted or updated.
    4244           0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
    4245             :                                  int &nXSize, int &nYSize, int nBufXSize,
    4246             :                                  int nBufYSize)
    4247             : {
    4248           0 :     return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
    4249           0 :                                          nBufXSize, nBufYSize, nullptr);
    4250             : }
    4251             : 
    4252      523998 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
    4253             :                                   int &nYOff, int &nXSize, int &nYSize,
    4254             :                                   int nBufXSize, int nBufYSize,
    4255             :                                   GDALRasterIOExtraArg *psExtraArg)
    4256             : {
                           // Selects the overview of poBand best suited to reading the window
                           // (nXOff, nYOff, nXSize, nYSize) into an nBufXSize x nBufYSize
                           // buffer.  Returns the overview index, or -1 when no overview
                           // qualifies.  On success the four window arguments are rewritten
                           // in overview pixel coordinates and, when psExtraArg is non-null,
                           // its floating point window may be rescaled as well.  When -1 is
                           // returned none of the arguments has been modified.
                           // psExtraArg may be null.

                           // The caller explicitly asked not to switch to another scale.
    4257      523998 :     if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
    4258      523998 :         psExtraArg->bUseOnlyThisScale)
    4259         109 :         return -1;
    4260             :     /* -------------------------------------------------------------------- */
    4261             :     /*      Compute the desired downsampling factor.  It is                 */
    4262             :     /*      based on the least reduced axis, and represents the number      */
    4263             :     /*      of source pixels to one destination pixel.                      */
    4264             :     /* -------------------------------------------------------------------- */
                           // The X ratio is used when it is the smaller of the two, and also
                           // unconditionally when nBufYSize == 1; otherwise the Y ratio is used.
    4265      523889 :     const double dfDesiredDownsamplingFactor =
    4266      523889 :         ((nXSize / static_cast<double>(nBufXSize)) <
    4267      361551 :              (nYSize / static_cast<double>(nBufYSize)) ||
    4268             :          nBufYSize == 1)
    4269      752276 :             ? nXSize / static_cast<double>(nBufXSize)
    4270      133164 :             : nYSize / static_cast<double>(nBufYSize);
    4271             : 
    4272             :     /* -------------------------------------------------------------------- */
    4273             :     /*      Find the overview level with the largest downsampling factor    */
    4274             :     /*      (most downsampled) that is still less (or only a little more)   */
    4275             :     /*      downsampled than the request.                                   */
    4276             :     /* -------------------------------------------------------------------- */
    4277      523889 :     const int nOverviewCount = poBand->GetOverviewCount();
    4278      523889 :     GDALRasterBand *poBestOverview = nullptr;
    4279      523889 :     double dfBestDownsamplingFactor = 0;
    4280      523889 :     int nBestOverviewLevel = -1;
    4281             : 
    4282             :     const char *pszOversampligThreshold =
    4283      523889 :         CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
    4284             : 
    4285             :     // Note: keep this logic for overview selection in sync between
    4286             :     // gdalwarp_lib.cpp and rasterio.cpp
    4287             :     // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
                           // Multiplier applied to the desired factor to get the acceptance
                           // limit: the configuration option wins when set; otherwise 1.0 when
                           // psExtraArg requests a resampling other than nearest neighbour,
                           // and 1.2 in every other case (including a null psExtraArg).
    4288             :     const double dfOversamplingThreshold =
    4289     1047770 :         pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
    4290      523880 :         : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
    4291     1047760 :             ? 1.0
    4292      523889 :             : 1.2;
    4293      526585 :     for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    4294             :     {
                               // Skip missing overviews and any "overview" that is larger than
                               // the band itself in either dimension.
    4295        5612 :         GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
    4296       11224 :         if (poOverview == nullptr ||
    4297       11223 :             poOverview->GetXSize() > poBand->GetXSize() ||
    4298        5611 :             poOverview->GetYSize() > poBand->GetYSize())
    4299             :         {
    4300           1 :             continue;
    4301             :         }
    4302             : 
    4303             :         // Compute downsampling factor of this overview
                               // (the smaller of the X and Y band-to-overview size ratios).
    4304             :         const double dfDownsamplingFactor = std::min(
    4305        5611 :             poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
    4306       11222 :             poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
    4307             : 
    4308             :         // Is it within the accepted limit of the requested factor and
    4309             :         // better (higher, i.e. more downsampled) than the current best factor?
    4310             :         // Use an epsilon because of numerical instability.
    4311        5611 :         constexpr double EPSILON = 1e-1;
    4312        5719 :         if (dfDownsamplingFactor >=
    4313        5611 :                 dfDesiredDownsamplingFactor * dfOversamplingThreshold +
    4314        5503 :                     EPSILON ||
    4315             :             dfDownsamplingFactor <= dfBestDownsamplingFactor)
    4316             :         {
    4317         108 :             continue;
    4318             :         }
    4319             : 
    4320             :         // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
                               // (The test is a case-insensitive prefix match on "AVERAGE_BIT2".)
    4321        5503 :         const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
    4322             : 
    4323        5503 :         if (pszResampling != nullptr &&
    4324          71 :             STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
    4325          16 :             continue;
    4326             : 
    4327             :         // OK, this is our new best overview.
    4328        5487 :         poBestOverview = poOverview;
    4329        5487 :         nBestOverviewLevel = iOverview;
    4330        5487 :         dfBestDownsamplingFactor = dfDownsamplingFactor;
    4331             : 
                               // A factor within EPSILON of the request is treated as an exact
                               // match: stop looking at further overviews.
    4332        5487 :         if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
    4333             :             EPSILON)
    4334             :         {
    4335        2916 :             break;
    4336             :         }
    4337             :     }
    4338             : 
    4339             :     /* -------------------------------------------------------------------- */
    4340             :     /*      If we didn't find an overview that helps us, just return        */
    4341             :     /*      indicating failure and the full resolution image will be used.  */
    4342             :     /* -------------------------------------------------------------------- */
    4343      523889 :     if (nBestOverviewLevel < 0)
    4344      520900 :         return -1;
    4345             : 
    4346             :     /* -------------------------------------------------------------------- */
    4347             :     /*      Recompute the source window in terms of the selected            */
    4348             :     /*      overview.                                                       */
    4349             :     /* -------------------------------------------------------------------- */
    4350             :     const double dfXFactor =
    4351        2989 :         poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
    4352             :     const double dfYFactor =
    4353        2989 :         poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
    4354        2989 :     CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
    4355             :              poBestOverview->GetYSize());
    4356             : 
                           // Offsets and sizes are rounded to the nearest overview pixel
                           // (+ 0.5 then truncation).  Offsets are capped at the last pixel of
                           // the overview, sizes are at least 1, and a size that would run
                           // past the overview's extent is trimmed to fit.
    4357        8967 :     const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
    4358        2989 :                                 static_cast<int>(nXOff / dfXFactor + 0.5));
    4359        8967 :     const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
    4360        2989 :                                 static_cast<int>(nYOff / dfYFactor + 0.5));
    4361        2989 :     int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
    4362        2989 :     int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
    4363        2989 :     if (nOXOff + nOXSize > poBestOverview->GetXSize())
    4364           0 :         nOXSize = poBestOverview->GetXSize() - nOXOff;
    4365        2989 :     if (nOYOff + nOYSize > poBestOverview->GetYSize())
    4366           2 :         nOYSize = poBestOverview->GetYSize() - nOYOff;
    4367             : 
    4368        2989 :     if (psExtraArg)
    4369             :     {
                               // A floating point window was already supplied: rescale it to
                               // overview coordinates.
    4370        2989 :         if (psExtraArg->bFloatingPointWindowValidity)
    4371             :         {
    4372         115 :             psExtraArg->dfXOff /= dfXFactor;
    4373         115 :             psExtraArg->dfXSize /= dfXFactor;
    4374         115 :             psExtraArg->dfYOff /= dfYFactor;
    4375         115 :             psExtraArg->dfYSize /= dfYFactor;
    4376             :         }
                               // Otherwise, for any resampling other than nearest neighbour,
                               // create one from the integer window.  nXOff, nYOff, nXSize and
                               // nYSize still hold the full resolution values at this point,
                               // so the unrounded overview window is recorded.
    4377        2874 :         else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
    4378             :         {
    4379          16 :             psExtraArg->bFloatingPointWindowValidity = true;
    4380          16 :             psExtraArg->dfXOff = nXOff / dfXFactor;
    4381          16 :             psExtraArg->dfXSize = nXSize / dfXFactor;
    4382          16 :             psExtraArg->dfYOff = nYOff / dfYFactor;
    4383          16 :             psExtraArg->dfYSize = nYSize / dfYFactor;
    4384             :         }
    4385             :     }
    4386             : 
                           // Publish the overview-space window through the reference arguments.
    4387        2989 :     nXOff = nOXOff;
    4388        2989 :     nYOff = nOYOff;
    4389        2989 :     nXSize = nOXSize;
    4390        2989 :     nYSize = nOYSize;
    4391             : 
    4392        2989 :     return nBestOverviewLevel;
    4393             : }
    4394             : 
    4395             : /************************************************************************/
    4396             : /*                          OverviewRasterIO()                          */
    4397             : /*                                                                      */
    4398             : /*      Special work function to utilize available overviews to         */
    4399             : /*      more efficiently satisfy downsampled requests.  It will         */
    4400             : /*      return CE_Failure if there are no appropriate overviews         */
    4401             : /*      available but it doesn't emit any error messages.               */
    4402             : /************************************************************************/
    4403             : 
    4404             : //! @cond Doxygen_Suppress
    4405           2 : CPLErr GDALRasterBand::OverviewRasterIO(
    4406             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4407             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4408             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    4409             : 
    4410             : {
                           // Work on a local extra-argument structure so that the rescaling
                           // done by GDALBandGetBestOverviewLevel2() is applied to sExtraArg
                           // rather than through the caller's psExtraArg pointer.
                           // NOTE(review): GDALCopyRasterIOExtraArg is defined outside this
                           // chunk; presumably it copies *psExtraArg into sExtraArg - confirm.
    4411             :     GDALRasterIOExtraArg sExtraArg;
    4412           2 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4413             : 
                           // nXOff, nYOff, nXSize and nYSize are by-value parameters of this
                           // method; they are passed by reference below and come back expressed
                           // in overview pixel coordinates when an overview is selected.
    4414           2 :     const int nOverview = GDALBandGetBestOverviewLevel2(
    4415             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
                           // No suitable overview: fail without emitting an error from here.
    4416           2 :     if (nOverview < 0)
    4417           1 :         return CE_Failure;
    4418             : 
    4419             :     /* -------------------------------------------------------------------- */
    4420             :     /*      Recast the call in terms of the new raster layer.               */
    4421             :     /* -------------------------------------------------------------------- */
    4422           1 :     GDALRasterBand *poOverviewBand = GetOverview(nOverview);
    4423           1 :     if (poOverviewBand == nullptr)
    4424           0 :         return CE_Failure;
    4425             : 
                           // Same buffer description, but the window is now the overview-space
                           // one and the (possibly rescaled) local extra arguments are used.
    4426           1 :     return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4427             :                                     pData, nBufXSize, nBufYSize, eBufType,
    4428           1 :                                     nPixelSpace, nLineSpace, &sExtraArg);
    4429             : }
    4430             : 
    4431             : /************************************************************************/
    4432             : /*                      TryOverviewRasterIO()                           */
    4433             : /************************************************************************/
    4434             : 
    4435      362417 : CPLErr GDALRasterBand::TryOverviewRasterIO(
    4436             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4437             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4438             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
    4439             :     int *pbTried)
    4440             : {
                           // Attempts to serve the request from an overview of this band.
                           // When an overview is selected and can be fetched, *pbTried is set
                           // to TRUE and the result of that overview band's RasterIO() is
                           // returned.  Otherwise *pbTried is set to FALSE and CE_None is
                           // returned without any I/O having been issued from here.
                           // pbTried is dereferenced unconditionally and must not be null.

                           // Working copies of the window: GDALBandGetBestOverviewLevel2()
                           // takes them by reference and rewrites them in overview coordinates.
    4441      362417 :     int nXOffMod = nXOff;
    4442      362417 :     int nYOffMod = nYOff;
    4443      362417 :     int nXSizeMod = nXSize;
    4444      362417 :     int nYSizeMod = nYSize;
    4445             :     GDALRasterIOExtraArg sExtraArg;
    4446             : 
                           // NOTE(review): helper defined outside this chunk; presumably it
                           // copies *psExtraArg into the local sExtraArg - confirm.
    4447      362417 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4448             : 
    4449      362417 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    4450             :         this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
    4451             :         &sExtraArg);
    4452             : 
    4453      362417 :     if (iOvrLevel >= 0)
    4454             :     {
    4455          50 :         GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
                               // A null overview band falls through to the "not tried" path.
    4456          50 :         if (poOverviewBand)
    4457             :         {
    4458          50 :             *pbTried = TRUE;
    4459          50 :             return poOverviewBand->RasterIO(
    4460             :                 eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
    4461             :                 nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    4462          50 :                 &sExtraArg);
    4463             :         }
    4464             :     }
    4465             : 
    4466      362367 :     *pbTried = FALSE;
    4467      362367 :     return CE_None;
    4468             : }
    4469             : 
    4470             : /************************************************************************/
    4471             : /*                      TryOverviewRasterIO()                           */
    4472             : /************************************************************************/
    4473             : 
    4474      158605 : CPLErr GDALDataset::TryOverviewRasterIO(
    4475             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4476             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4477             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    4478             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
    4479             :     int *pbTried)
    4480             : {
                           // Dataset-level counterpart of the band method: attempts to serve
                           // a multi-band request from the dataset that owns the selected
                           // overview.  *pbTried is set to TRUE when that dataset's RasterIO()
                           // is invoked (its result is returned), and to FALSE, with CE_None
                           // returned, when no usable overview dataset is found.
                           // pbTried is dereferenced unconditionally and must not be null.

                           // Working copies of the window: GDALBandGetBestOverviewLevel2()
                           // takes them by reference and rewrites them in overview coordinates.
    4481      158605 :     int nXOffMod = nXOff;
    4482      158605 :     int nYOffMod = nYOff;
    4483      158605 :     int nXSizeMod = nXSize;
    4484      158605 :     int nYSizeMod = nYSize;
    4485             :     GDALRasterIOExtraArg sExtraArg;
    4486      158605 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4487             : 
                           // The overview level is chosen from the first band only, whatever
                           // panBandMap requests.
                           // NOTE(review): papoBands[0] is accessed without checking that the
                           // dataset has at least one band - confirm callers guarantee it.
    4488      317210 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    4489      158605 :         papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
    4490             :         nBufYSize, &sExtraArg);
    4491             : 
                           // Both the overview band and its owning dataset must be non-null,
                           // since the request is forwarded to that dataset's RasterIO().
    4492      158646 :     if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
    4493          41 :         papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
    4494             :     {
    4495          41 :         *pbTried = TRUE;
    4496          41 :         return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
    4497             :             eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
    4498             :             nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
    4499          41 :             nLineSpace, nBandSpace, &sExtraArg);
    4500             :     }
    4501             :     else
    4502             :     {
    4503      158564 :         *pbTried = FALSE;
    4504      158564 :         return CE_None;
    4505             :     }
    4506             : }
    4507             : 
    4508             : /************************************************************************/
    4509             : /*                  GDALDatasetGetBestOverviewLevel()                   */
    4510             : /*                                                                      */
    4511             : /* Returns the best overview level to satisfy the query or -1 if none.  */
    4512             : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid     */
    4513             : /* overview level.                                                      */
    4514             : /************************************************************************/
    4515             : /* Every band listed in panBandMap is compared with the first listed
                     :  * band. -1 is returned when:
                     :  *  - poDS->GetRasterBand() returns nullptr for a listed band,
                     :  *  - a band has a different overview count than the first band,
                     :  *  - an overview that exists on both bands differs in raster size or
                     :  *    in block size,
                     :  *  - no band was visited at all (nBandCount <= 0).
                     :  * Otherwise the decision is delegated to
                     :  * GDALBandGetBestOverviewLevel2() on the first band; the window
                     :  * arguments are forwarded by reference.
                     :  */
    4516           4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
    4517             :                                            int &nYOff, int &nXSize, int &nYSize,
    4518             :                                            int nBufXSize, int nBufYSize,
    4519             :                                            int nBandCount,
    4520             :                                            const int *panBandMap,
    4521             :                                            GDALRasterIOExtraArg *psExtraArg)
    4522             : {
    4523           4 :     int nOverviewCount = 0;
    4524           4 :     GDALRasterBand *poFirstBand = nullptr;
    4525             : 
    4526             :     /* -------------------------------------------------------------------- */
    4527             :     /* Check that all bands have the same number of overviews and           */
    4528             :     /* that they have all the same size and block dimensions                */
    4529             :     /* -------------------------------------------------------------------- */
    4530          12 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4531             :     {
    4532           8 :         GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
    4533           8 :         if (poBand == nullptr)
    4534           0 :             return -1;
    4535           8 :         if (iBand == 0)
    4536             :         {
    4537           4 :             poFirstBand = poBand;  // reference band for all later comparisons
    4538           4 :             nOverviewCount = poBand->GetOverviewCount();
    4539             :         }
    4540           4 :         else if (nOverviewCount != poBand->GetOverviewCount())
    4541             :         {
    4542           0 :             CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    4543             :                              "mismatched overview count, use std method.");
    4544           0 :             return -1;
    4545             :         }
    4546             :         else
    4547             :         {
    4548           4 :             for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    4549             :             {
    4550           0 :                 GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
    4551             :                 GDALRasterBand *poOvrFirstBand =
    4552           0 :                     poFirstBand->GetOverview(iOverview);
    4553           0 :                 if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
    4554           0 :                     continue;  // a missing overview on either side is skipped, not treated as a mismatch
    4555             : 
    4556           0 :                 if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
    4557           0 :                     poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
    4558             :                 {
    4559           0 :                     CPLDebug("GDAL",
    4560             :                              "GDALDataset::GetBestOverviewLevel() ... "
    4561             :                              "mismatched overview sizes, use std method.");
    4562           0 :                     return -1;
    4563             :                 }
    4564           0 :                 int nBlockXSizeFirst = 0;
    4565           0 :                 int nBlockYSizeFirst = 0;
    4566           0 :                 poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
    4567             :                                              &nBlockYSizeFirst);
    4568             : 
    4569           0 :                 int nBlockXSizeCurrent = 0;
    4570           0 :                 int nBlockYSizeCurrent = 0;
    4571           0 :                 poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
    4572             :                                         &nBlockYSizeCurrent);
    4573             : 
    4574           0 :                 if (nBlockXSizeFirst != nBlockXSizeCurrent ||
    4575           0 :                     nBlockYSizeFirst != nBlockYSizeCurrent)
    4576             :                 {
    4577           0 :                     CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    4578             :                                      "mismatched block sizes, use std method.");
    4579           0 :                     return -1;
    4580             :                 }
    4581             :             }
    4582             :         }
    4583             :     }
    4584           4 :     if (poFirstBand == nullptr)  // only possible when the loop never ran (nBandCount <= 0)
    4585           0 :         return -1;
    4586             : 
    4587           4 :     return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
    4588             :                                          nYSize, nBufXSize, nBufYSize,
    4589           4 :                                          psExtraArg);
    4590             : }
    4591             : 
    4592             : /************************************************************************/
    4593             : /*                         BlockBasedRasterIO()                         */
    4594             : /*                                                                      */
    4595             : /*      This convenience function implements a dataset level            */
    4596             : /*      RasterIO() interface based on calling down to fetch blocks,     */
    4597             : /*      much like the GDALRasterBand::IRasterIO(), but it handles       */
    4598             : /*      all bands at once, so that a format driver that handles a       */
    4599             : /*      request for different bands of the same block efficiently       */
    4600             : /*      (i.e. without re-reading interleaved data) will benefit.        */
    4601             : /*                                                                      */
    4602             : /*      This method is intended to be called by an overridden           */
    4603             : /*      IRasterIO() method in the driver specific GDALDataset           */
    4604             : /*      derived class.                                                  */
    4605             : /*                                                                      */
    4606             : /*      Default internal implementation of RasterIO() ... utilizes      */
    4607             : /*      the Block access methods to satisfy the request.  This would    */
    4608             : /*      normally only be overridden by formats with overviews.          */
    4609             : /*                                                                      */
    4610             : /*      To keep things relatively simple, this method does not          */
    4611             : /*      currently take advantage of some special cases addressed in     */
    4612             : /*      GDALRasterBand::IRasterIO(), so it is likely best to only       */
    4613             : /*      call it when you know it will help.  That is in cases where     */
    4614             : /*      data is at 1:1 to the buffer, and you know the driver is        */
    4615             : /*      implementing interleaved IO efficiently on a block by block     */
    4616             : /*      basis. Overviews will be used when possible.                    */
    4617             : /************************************************************************/
    4618             : /* Outline of the implementation, in order:
                     :  *  1. Band scan: the first requested band supplies the block size and
                     :  *     data type. A later band with a different block size, or with a
                     :  *     different data type while window and buffer sizes differ, sends
                     :  *     the whole request to BandBasedRasterIO().
                     :  *  2. 1:1 path (window size == buffer size and no fractional window):
                     :  *     the window is cut along block boundaries and each chunk is
                     :  *     handed to every band's IRasterIO() before the next chunk.
                     :  *  3. Writes whose buffer is smaller than the window in either
                     :  *     dimension, and non-nearest resampling with differing sizes, go
                     :  *     to BandBasedRasterIO().
                     :  *  4. General path: an overview level is selected if possible, then
                     :  *     every buffer pixel is copied from/to one source pixel through
                     :  *     locked blocks. This path never calls psExtraArg->pfnProgress.
                     :  *
                     :  * Returns CE_None on success, CE_Failure when the progress callback
                     :  * asks to stop or a block cannot be locked, and otherwise whatever
                     :  * the failing IRasterIO() / BandBasedRasterIO() call returned.
                     :  */
    4619       64164 : CPLErr GDALDataset::BlockBasedRasterIO(
    4620             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4621             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4622             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    4623             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    4624             : 
    4625             : {
    4626       64164 :     CPLAssert(nullptr != pData);
    4627             : 
    4628       64164 :     GByte **papabySrcBlock = nullptr;  // general path: per-band pointer to the locked block's data
    4629       64164 :     GDALRasterBlock *poBlock = nullptr;
    4630       64164 :     GDALRasterBlock **papoBlocks = nullptr;  // general path: per-band block currently locked
    4631       64164 :     int nLBlockX = -1;  // -1: no block loaded yet, forces a load on the first pixel
    4632       64164 :     int nLBlockY = -1;
    4633             :     int iBufYOff;
    4634             :     int iBufXOff;
    4635       64164 :     int nBlockXSize = 1;
    4636       64164 :     int nBlockYSize = 1;
    4637       64164 :     CPLErr eErr = CE_None;
    4638       64164 :     GDALDataType eDataType = GDT_UInt8;
    4639             : 
    4640       64164 :     const bool bUseIntegerRequestCoords =  // true when no floating-point window is given, or it equals the integer window
    4641       64194 :         (!psExtraArg->bFloatingPointWindowValidity ||
    4642          30 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
    4643          28 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
    4644             : 
    4645             :     /* -------------------------------------------------------------------- */
    4646             :     /*      Ensure that all bands share a common block size and data type.  */
    4647             :     /* -------------------------------------------------------------------- */
    4648      304122 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4649             :     {
    4650      239958 :         GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);  // NOTE(review): result is used without a null check
    4651             : 
    4652      239958 :         if (iBand == 0)
    4653             :         {
    4654       64164 :             poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4655       64164 :             eDataType = poBand->GetRasterDataType();
    4656             :         }
    4657             :         else
    4658             :         {
    4659      175794 :             int nThisBlockXSize = 0;
    4660      175794 :             int nThisBlockYSize = 0;
    4661      175794 :             poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
    4662      175794 :             if (nThisBlockXSize != nBlockXSize ||
    4663      175794 :                 nThisBlockYSize != nBlockYSize)
    4664             :             {
    4665           0 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4666             :                                  "mismatched block sizes, use std method.");
    4667           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4668             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4669             :                                          nBandCount, panBandMap, nPixelSpace,
    4670           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4671             :             }
    4672             : 
    4673      175794 :             if (eDataType != poBand->GetRasterDataType() &&  // a type mismatch only forces the fallback when sizes differ
    4674           0 :                 (nXSize != nBufXSize || nYSize != nBufYSize))
    4675             :             {
    4676           0 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4677             :                                  "mismatched band data types, use std method.");
    4678           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4679             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4680             :                                          nBandCount, panBandMap, nPixelSpace,
    4681           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4682             :             }
    4683             :         }
    4684             :     }
    4685             : 
    4686             :     /* ==================================================================== */
    4687             :     /*      In this special case at full resolution we step through in      */
    4688             :     /*      blocks, turning the request over to the per-band                */
    4689             :     /*      IRasterIO(), but ensuring that all bands of one block are       */
    4690             :     /*      called before proceeding to the next.                           */
    4691             :     /* ==================================================================== */
    4692             : 
    4693       64164 :     if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
    4694             :     {
    4695             :         GDALRasterIOExtraArg sDummyExtraArg;  // per-band calls get this freshly initialised struct, not the caller's
    4696       64160 :         INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
    4697             : 
    4698       64160 :         int nChunkYSize = 0;
    4699       64160 :         int nChunkXSize = 0;
    4700             : 
    4701      210807 :         for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
    4702             :         {
    4703      147664 :             const int nChunkYOff = iBufYOff + nYOff;
    4704      147664 :             nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);  // rows from nChunkYOff to the next block boundary
    4705      147664 :             if (nChunkYOff + nChunkYSize > nYOff + nYSize)
    4706       59196 :                 nChunkYSize = (nYOff + nYSize) - nChunkYOff;  // clip to the end of the requested window
    4707             : 
    4708      818665 :             for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
    4709             :             {
    4710      672016 :                 const int nChunkXOff = iBufXOff + nXOff;
    4711      672016 :                 nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);  // columns up to the next block boundary
    4712      672016 :                 if (nChunkXOff + nChunkXSize > nXOff + nXSize)
    4713       70395 :                     nChunkXSize = (nXOff + nXSize) - nChunkXOff;
    4714             : 
    4715      672016 :                 GByte *pabyChunkData =  // address in the user buffer of this chunk's first pixel, band index 0
    4716      672016 :                     static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
    4717      672016 :                     static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
    4718             : 
    4719     3271840 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4720             :                 {
    4721     2600840 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4722             : 
    4723     5201690 :                     eErr = poBand->IRasterIO(
    4724             :                         eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
    4725             :                         nChunkYSize,
    4726     2600840 :                         pabyChunkData +
    4727     2600840 :                             static_cast<GPtrDiff_t>(iBand) * nBandSpace,
    4728             :                         nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
    4729     2600840 :                         nLineSpace, &sDummyExtraArg);
    4730     2600840 :                     if (eErr != CE_None)
    4731        1015 :                         return eErr;  // the first failing band aborts the whole request
    4732             :                 }
    4733             :             }
    4734             : 
    4735      165480 :             if (psExtraArg->pfnProgress != nullptr &&  // progress is reported once per row of chunks
    4736       18831 :                 !psExtraArg->pfnProgress(
    4737      165480 :                     1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
    4738             :                         nBufYSize,
    4739             :                     "", psExtraArg->pProgressData))
    4740             :             {
    4741           2 :                 return CE_Failure;  // the callback returned false
    4742             :             }
    4743             :         }
    4744             : 
    4745       63143 :         return CE_None;
    4746             :     }
    4747             : 
    4748             :     /* Below code is not compatible with that case. It would need a complete */
    4749             :     /* separate code like done in GDALRasterBand::IRasterIO. */
    4750           4 :     if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
    4751             :     {
    4752           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4753             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4754             :                                  panBandMap, nPixelSpace, nLineSpace,
    4755           0 :                                  nBandSpace, psExtraArg);
    4756             :     }
    4757             : 
    4758             :     /* We could have a smarter implementation, but that will do for now */
    4759           4 :     if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    4760           0 :         (nBufXSize != nXSize || nBufYSize != nYSize))
    4761             :     {
    4762           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4763             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4764             :                                  panBandMap, nPixelSpace, nLineSpace,
    4765           0 :                                  nBandSpace, psExtraArg);
    4766             :     }
    4767             : 
    4768             :     /* ==================================================================== */
    4769             :     /*      Loop reading required source blocks to satisfy output           */
    4770             :     /*      request.  This is the most general implementation.              */
    4771             :     /* ==================================================================== */
    4772             : 
    4773           4 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);  // eDataType comes from the first requested band
    4774             : 
    4775             :     papabySrcBlock =
    4776           4 :         static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
    4777             :     papoBlocks =  // the code below tests entries against nullptr before the first assignment
    4778           4 :         static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
    4779             : 
    4780             :     /* -------------------------------------------------------------------- */
    4781             :     /*      Select an overview level if appropriate.                        */
    4782             :     /* -------------------------------------------------------------------- */
    4783             : 
    4784             :     GDALRasterIOExtraArg sExtraArg;
    4785           4 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4786           4 :     const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(  // nXOff/nYOff/nXSize/nYSize are passed by reference and may be rewritten
    4787             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
    4788             :         panBandMap, &sExtraArg);
    4789           4 :     if (nOverviewLevel >= 0)
    4790             :     {
    4791           2 :         GetRasterBand(panBandMap[0])  // from here on block sizes are those of the first band's overview
    4792           2 :             ->GetOverview(nOverviewLevel)
    4793           2 :             ->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4794             :     }
    4795             : 
    4796           4 :     double dfXOff = nXOff;
    4797           4 :     double dfYOff = nYOff;
    4798           4 :     double dfXSize = nXSize;
    4799           4 :     double dfYSize = nYSize;
    4800           4 :     if (sExtraArg.bFloatingPointWindowValidity)  // prefer the fractional window of the local copy when it is flagged valid
    4801             :     {
    4802           2 :         dfXOff = sExtraArg.dfXOff;
    4803           2 :         dfYOff = sExtraArg.dfYOff;
    4804           2 :         dfXSize = sExtraArg.dfXSize;
    4805           2 :         dfYSize = sExtraArg.dfYSize;
    4806             :     }
    4807             : 
    4808             :     /* -------------------------------------------------------------------- */
    4809             :     /*      Compute stepping increment.                                     */
    4810             :     /* -------------------------------------------------------------------- */
    4811           4 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);  // source pixels per buffer pixel
    4812           4 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
    4813             : 
    4814           4 :     constexpr double EPS = 1e-10;
    4815             :     /* -------------------------------------------------------------------- */
    4816             :     /*      Loop over buffer computing source locations.                    */
    4817             :     /* -------------------------------------------------------------------- */
    4818          36 :     for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
    4819             :     {
    4820             :         GPtrDiff_t iSrcOffset;
    4821             : 
    4822             :         // Add small epsilon to avoid some numeric precision issues.
    4823          32 :         const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;  // sampled at the buffer pixel centre
    4824          32 :         const int iSrcY = static_cast<int>(std::min(  // NOTE(review): clamped against nRasterYSize even when an overview level was selected -- confirm
    4825          32 :             std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
    4826             : 
    4827          32 :         GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
    4828             :                                 static_cast<GPtrDiff_t>(nLineSpace);
    4829             : 
    4830         302 :         for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
    4831             :         {
    4832         270 :             const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
    4833         270 :             const int iSrcX = static_cast<int>(std::min(
    4834         270 :                 std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
    4835             : 
    4836             :             // FIXME: this code likely doesn't work if the dirty block gets
    4837             :             // flushed to disk before being completely written. In the meantime,
    4838             :             // bJustInitialize should probably be set to FALSE even if it is not
    4839             :             // ideal performance wise, and for lossy compression
    4840             : 
    4841             :             /* --------------------------------------------------------------------
    4842             :              */
    4843             :             /*      Ensure we have the appropriate block loaded. */
    4844             :             /* --------------------------------------------------------------------
    4845             :              */
    4846         270 :             if (iSrcX < nLBlockX * nBlockXSize ||  // (iSrcX, iSrcY) falls outside the block currently held
    4847         270 :                 iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
    4848         266 :                 iSrcY < nLBlockY * nBlockYSize ||
    4849         266 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
    4850             :             {
    4851           4 :                 nLBlockX = iSrcX / nBlockXSize;
    4852           4 :                 nLBlockY = iSrcY / nBlockYSize;
    4853             : 
    4854           4 :                 const bool bJustInitialize =  // only for writes whose window covers the full block extent
    4855           0 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
    4856           0 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
    4857           4 :                     nXOff <= nLBlockX * nBlockXSize &&
    4858           0 :                     nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
    4859             :                 /*bool bMemZeroBuffer = FALSE;
    4860             :                 if( eRWFlag == GF_Write && !bJustInitialize &&
    4861             :                     nXOff <= nLBlockX * nBlockXSize &&
    4862             :                     nYOff <= nLBlockY * nBlockYSize &&
    4863             :                     (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
    4864             :                      (nXOff + nXSize == GetRasterXSize() &&
    4865             :                      (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
    4866             :                     (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
    4867             :                      (nYOff + nYSize == GetRasterYSize() &&
    4868             :                      (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
    4869             :                 {
    4870             :                     bJustInitialize = TRUE;
    4871             :                     bMemZeroBuffer = TRUE;
    4872             :                 }*/
    4873          12 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4874             :                 {
    4875           8 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4876           8 :                     if (nOverviewLevel >= 0)
    4877           2 :                         poBand = poBand->GetOverview(nOverviewLevel);
    4878          16 :                     poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
    4879           8 :                                                         bJustInitialize);
    4880           8 :                     if (poBlock == nullptr)
    4881             :                     {
    4882           0 :                         eErr = CE_Failure;
    4883           0 :                         goto CleanupAndReturn;  // blocks still recorded in papoBlocks are unlocked there
    4884             :                     }
    4885             : 
    4886           8 :                     if (eRWFlag == GF_Write)
    4887           0 :                         poBlock->MarkDirty();
    4888             : 
    4889           8 :                     if (papoBlocks[iBand] != nullptr)
    4890           0 :                         papoBlocks[iBand]->DropLock();  // release this band's previous block before replacing it
    4891             : 
    4892           8 :                     papoBlocks[iBand] = poBlock;
    4893             : 
    4894           8 :                     papabySrcBlock[iBand] =
    4895           8 :                         static_cast<GByte *>(poBlock->GetDataRef());
    4896             :                     /*if( bMemZeroBuffer )
    4897             :                     {
    4898             :                         memset(papabySrcBlock[iBand], 0,
    4899             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
    4900             :                     * nBlockYSize);
    4901             :                     }*/
    4902             :                 }
    4903             :             }
    4904             : 
    4905             :             /* --------------------------------------------------------------------
    4906             :              */
    4907             :             /*      Copy over this pixel of data. */
    4908             :             /* --------------------------------------------------------------------
    4909             :              */
    4910         270 :             iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -  // byte offset of the pixel inside the block, rows of nBlockXSize pixels
    4911         270 :                           static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
    4912         270 :                           (static_cast<GPtrDiff_t>(iSrcY) -
    4913         270 :                            static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
    4914         270 :                               nBlockXSize) *
    4915         270 :                          nBandDataSize;
    4916             : 
    4917         980 :             for (int iBand = 0; iBand < nBandCount; iBand++)
    4918             :             {
    4919         710 :                 GByte *pabySrcBlock = papabySrcBlock[iBand];
    4920         710 :                 GPtrDiff_t iBandBufOffset =
    4921         710 :                     iBufOffset + static_cast<GPtrDiff_t>(iBand) *
    4922             :                                      static_cast<GPtrDiff_t>(nBandSpace);
    4923             : 
    4924         710 :                 if (eDataType == eBufType)  // NOTE(review): eDataType is the first band's type and is applied to every band here -- confirm
    4925             :                 {
    4926         710 :                     if (eRWFlag == GF_Read)
    4927         710 :                         memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
    4928         710 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
    4929             :                     else
    4930           0 :                         memcpy(pabySrcBlock + iSrcOffset,
    4931             :                                static_cast<const GByte *>(pData) +
    4932           0 :                                    iBandBufOffset,
    4933             :                                nBandDataSize);
    4934             :                 }
    4935             :                 else
    4936             :                 {
    4937             :                     /* type to type conversion ... ouch, this is expensive way
    4938             :                        of handling single words */
    4939             : 
    4940           0 :                     if (eRWFlag == GF_Read)
    4941           0 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
    4942             :                                         static_cast<GByte *>(pData) +
    4943           0 :                                             iBandBufOffset,
    4944             :                                         eBufType, 0, 1);
    4945             :                     else
    4946           0 :                         GDALCopyWords64(static_cast<const GByte *>(pData) +
    4947           0 :                                             iBandBufOffset,
    4948           0 :                                         eBufType, 0, pabySrcBlock + iSrcOffset,
    4949             :                                         eDataType, 0, 1);
    4950             :                 }
    4951             :             }
    4952             : 
    4953         270 :             iBufOffset += static_cast<int>(nPixelSpace);  // NOTE(review): narrows nPixelSpace to int while the other offsets use GPtrDiff_t -- confirm
    4954             :         }
    4955             :     }
    4956             : 
    4957             :     /* -------------------------------------------------------------------- */
    4958             :     /*      CleanupAndReturn.                                               */
    4959             :     /* -------------------------------------------------------------------- */
    4960           4 : CleanupAndReturn:
    4961           4 :     CPLFree(papabySrcBlock);
    4962           4 :     if (papoBlocks != nullptr)
    4963             :     {
    4964          12 :         for (int iBand = 0; iBand < nBandCount; iBand++)
    4965             :         {
    4966           8 :             if (papoBlocks[iBand] != nullptr)
    4967           8 :                 papoBlocks[iBand]->DropLock();  // unlock the last block held for each band
    4968             :         }
    4969           4 :         CPLFree(papoBlocks);
    4970             :     }
    4971             : 
    4972           4 :     return eErr;
    4973             : }
    4974             : 
    4975             : //! @endcond
    4976             : 
    4977             : /************************************************************************/
    4978             : /*                  GDALCopyWholeRasterGetSwathSize()                   */
    4979             : /************************************************************************/
    4980             : 
    4981        3296 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
    4982             :                                             GDALRasterBand *poDstPrototypeBand,
    4983             :                                             int nBandCount,
    4984             :                                             int bDstIsCompressed,
    4985             :                                             int bInterleave, int *pnSwathCols,
    4986             :                                             int *pnSwathLines)
    4987             : {
    4988        3296 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    4989        3296 :     int nSrcBlockXSize = 0;
    4990        3296 :     int nSrcBlockYSize = 0;
    4991        3296 :     int nBlockXSize = 0;
    4992        3296 :     int nBlockYSize = 0;
    4993             : 
    4994        3296 :     int nXSize = poSrcPrototypeBand->GetXSize();
    4995        3296 :     int nYSize = poSrcPrototypeBand->GetYSize();
    4996             : 
    4997        3296 :     poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
    4998        3296 :     poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4999             : 
    5000        3296 :     const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
    5001        3296 :     const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
    5002             : 
    5003        3296 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5004        3296 :     if (bInterleave)
    5005         556 :         nPixelSize *= nBandCount;
    5006             : 
    5007             :     // aim for one row of blocks.  Do not settle for less.
    5008        3296 :     int nSwathCols = nXSize;
    5009        3296 :     int nSwathLines = nMaxBlockYSize;
    5010             : 
    5011             :     const char *pszSrcCompression =
    5012        3296 :         poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    5013        3296 :     if (pszSrcCompression == nullptr)
    5014             :     {
    5015        3270 :         auto poSrcDS = poSrcPrototypeBand->GetDataset();
    5016        3270 :         if (poSrcDS)
    5017             :             pszSrcCompression =
    5018        3264 :                 poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    5019             :     }
    5020             : 
    5021             :     /* -------------------------------------------------------------------- */
    5022             :     /*      What will our swath size be?                                    */
    5023             :     /* -------------------------------------------------------------------- */
    5024             :     // When writing interleaved data in a compressed format, we want to be sure
    5025             :     // that each block will only be written once, so the swath size must not be
    5026             :     // greater than the block cache.
    5027        3296 :     const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
    5028             :     int nTargetSwathSize;
    5029        3296 :     if (pszSwathSize != nullptr)
    5030           0 :         nTargetSwathSize = static_cast<int>(
    5031           0 :             std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
    5032             :     else
    5033             :     {
    5034             :         // As a default, take one 1/4 of the cache size.
    5035        3296 :         nTargetSwathSize = static_cast<int>(
    5036        3296 :             std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
    5037             : 
    5038             :         // but if the minimum idal swath buf size is less, then go for it to
    5039             :         // avoid unnecessarily abusing RAM usage.
    5040             :         // but try to use 10 MB at least.
    5041        3296 :         GIntBig nIdealSwathBufSize =
    5042        3296 :             static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
    5043        3296 :         int nMinTargetSwathSize = 10 * 1000 * 1000;
    5044             : 
    5045        3296 :         if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
    5046        3296 :              GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
    5047             :         {
    5048           1 :             nMinTargetSwathSize = nTargetSwathSize;
    5049             :         }
    5050             : 
    5051        3296 :         if (nIdealSwathBufSize < nTargetSwathSize &&
    5052        3286 :             nIdealSwathBufSize < nMinTargetSwathSize)
    5053             :         {
    5054        3283 :             nIdealSwathBufSize = nMinTargetSwathSize;
    5055             :         }
    5056             : 
    5057        3296 :         if (pszSrcCompression != nullptr &&
    5058         184 :             EQUAL(pszSrcCompression, "JPEG2000") &&
    5059           0 :             (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
    5060           0 :                                    (nSrcBlockYSize % nBlockYSize) == 0)))
    5061             :         {
    5062           2 :             nIdealSwathBufSize =
    5063           4 :                 std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
    5064           2 :                                                  nSrcBlockYSize * nPixelSize);
    5065             :         }
    5066        3296 :         if (nTargetSwathSize > nIdealSwathBufSize)
    5067        3283 :             nTargetSwathSize = static_cast<int>(
    5068        3283 :                 std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
    5069             :     }
    5070             : 
    5071        3296 :     if (nTargetSwathSize < 1000000)
    5072           8 :         nTargetSwathSize = 1000000;
    5073             : 
    5074             :     /* But let's check that  */
    5075        3517 :     if (bDstIsCompressed && bInterleave &&
    5076         221 :         nTargetSwathSize > GDALGetCacheMax64())
    5077             :     {
    5078           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    5079             :                  "When translating into a compressed interleave format, "
    5080             :                  "the block cache size (" CPL_FRMT_GIB ") "
    5081             :                  "should be at least the size of the swath (%d) "
    5082             :                  "(GDAL_SWATH_SIZE config. option)",
    5083             :                  GDALGetCacheMax64(), nTargetSwathSize);
    5084             :     }
    5085             : 
    5086             : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
    5087             : #define ROUND_TO(x, y) (((x) / (y)) * (y))
    5088             : 
    5089             :     // if both input and output datasets are tiled, that the tile dimensions
    5090             :     // are "compatible", try to stick  to a swath dimension that is a multiple
    5091             :     // of input and output block dimensions.
    5092        3296 :     if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
    5093          43 :         IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
    5094          43 :         IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
    5095          43 :         IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    5096          43 :         IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    5097             :     {
    5098          43 :         if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
    5099          43 :                 nPixelSize <=
    5100          43 :             static_cast<GIntBig>(nTargetSwathSize))
    5101             :         {
    5102          43 :             nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
    5103          43 :             nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
    5104          43 :             if (nSwathCols == 0)
    5105           0 :                 nSwathCols = nMaxBlockXSize;
    5106          43 :             if (nSwathCols > nXSize)
    5107          41 :                 nSwathCols = nXSize;
    5108          43 :             nSwathLines = nMaxBlockYSize;
    5109             : 
    5110          43 :             if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
    5111          43 :                 static_cast<GIntBig>(nTargetSwathSize))
    5112             :             {
    5113           0 :                 nSwathCols = nXSize;
    5114           0 :                 nSwathLines = nBlockYSize;
    5115             :             }
    5116             :         }
    5117             :     }
    5118             : 
    5119        3296 :     const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
    5120        3296 :     const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
    5121        3296 :     if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
    5122             :     {
    5123           1 :         nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
    5124           1 :         if (nSwathLines == 0)
    5125           1 :             nSwathLines = 1;
    5126             : 
    5127           1 :         CPLDebug(
    5128             :             "GDAL",
    5129             :             "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
    5130             :             "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
    5131             :             "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
    5132           1 :             nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
    5133             :     }
    5134             :     // If we are processing single scans, try to handle several at once.
    5135             :     // If we are handling swaths already, only grow the swath if a row
    5136             :     // of blocks is substantially less than our target buffer size.
    5137        3295 :     else if (nSwathLines == 1 ||
    5138        2738 :              nMemoryPerCol * nSwathLines <
    5139        2738 :                  static_cast<GIntBig>(nTargetSwathSize) / 10)
    5140             :     {
    5141        3267 :         nSwathLines = std::min(
    5142             :             nYSize,
    5143        3267 :             std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
    5144             : 
    5145             :         /* If possible try to align to source and target block height */
    5146        3267 :         if ((nSwathLines % nMaxBlockYSize) != 0 &&
    5147         261 :             nSwathLines > nMaxBlockYSize &&
    5148         261 :             IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    5149         232 :             IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    5150         209 :             nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
    5151             :     }
    5152             : 
    5153        3296 :     if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
    5154           0 :         (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
    5155           0 :                                IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
    5156             :     {
    5157             :         // Typical use case: converting from Pleaiades that is 2048x2048 tiled.
    5158           2 :         if (nSwathLines < nSrcBlockYSize)
    5159             :         {
    5160           0 :             nSwathLines = nSrcBlockYSize;
    5161             : 
    5162             :             // Number of pixels that can be read/write simultaneously.
    5163           0 :             nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
    5164           0 :             nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
    5165           0 :             if (nSwathCols == 0)
    5166           0 :                 nSwathCols = nSrcBlockXSize;
    5167           0 :             if (nSwathCols > nXSize)
    5168           0 :                 nSwathCols = nXSize;
    5169             : 
    5170           0 :             CPLDebug(
    5171             :                 "GDAL",
    5172             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    5173             :                 "too high block, "
    5174             :                 "use partial width at one time");
    5175             :         }
    5176           2 :         else if ((nSwathLines % nSrcBlockYSize) != 0)
    5177             :         {
    5178             :             /* Round on a multiple of nSrcBlockYSize */
    5179           0 :             nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
    5180           0 :             CPLDebug(
    5181             :                 "GDAL",
    5182             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    5183             :                 "round nSwathLines to block height : %d",
    5184             :                 nSwathLines);
    5185             :         }
    5186             :     }
    5187        3294 :     else if (bDstIsCompressed)
    5188             :     {
    5189         415 :         if (nSwathLines < nBlockYSize)
    5190             :         {
    5191         146 :             nSwathLines = nBlockYSize;
    5192             : 
    5193             :             // Number of pixels that can be read/write simultaneously.
    5194         146 :             nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
    5195         146 :             nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
    5196         146 :             if (nSwathCols == 0)
    5197           0 :                 nSwathCols = nBlockXSize;
    5198         146 :             if (nSwathCols > nXSize)
    5199         146 :                 nSwathCols = nXSize;
    5200             : 
    5201         146 :             CPLDebug(
    5202             :                 "GDAL",
    5203             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    5204             :                 "too high block, "
    5205             :                 "use partial width at one time");
    5206             :         }
    5207         269 :         else if ((nSwathLines % nBlockYSize) != 0)
    5208             :         {
    5209             :             // Round on a multiple of nBlockYSize.
    5210           9 :             nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
    5211           9 :             CPLDebug(
    5212             :                 "GDAL",
    5213             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    5214             :                 "round nSwathLines to block height : %d",
    5215             :                 nSwathLines);
    5216             :         }
    5217             :     }
    5218             : 
    5219        3296 :     *pnSwathCols = nSwathCols;
    5220        3296 :     *pnSwathLines = nSwathLines;
    5221        3296 : }
    5222             : 
    5223             : /************************************************************************/
    5224             : /*                     GDALDatasetCopyWholeRaster()                     */
    5225             : /************************************************************************/
    5226             : 
    5227             : /**
    5228             :  * \brief Copy all dataset raster data.
    5229             :  *
    5230             :  * This function copies the complete raster contents of one dataset to
    5231             :  * another similarly configured dataset.  The source and destination
    5232             :  * dataset must have the same number of bands, and the same width
    5233             :  * and height.  The bands do not have to have the same data type.
    5234             :  *
    5235             :  * This function is primarily intended to support implementation of
    5236             :  * driver specific CreateCopy() functions.  It implements efficient copying,
    5237             :  * in particular "chunking" the copy in substantial blocks and, if appropriate,
    5238             :  * performing the transfer in a pixel interleaved fashion.
    5239             :  *
    5240             :  * Currently the only papszOptions values supported are:
    5241             :  * <ul>
    5242             :  * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
    5243             :  * write access pattern (this does not modify the layout of the destination
    5244             :  * data)</li> <li>"COMPRESSED=YES" to force alignment on target dataset block
    5245             :  * sizes to achieve best compression.</li> <li>"SKIP_HOLES=YES" to skip chunks
    5246             :  * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
    5247             :  * (GDAL &gt;= 2.2)</li>
    5248             :  * </ul>
    5249             :  * More options may be supported in the future.
    5250             :  *
    5251             :  * @param hSrcDS the source dataset
    5252             :  * @param hDstDS the destination dataset
    5253             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    5254             :  * @param pfnProgress progress reporting function.
    5255             :  * @param pProgressData callback data for progress function.
    5256             :  *
    5257             :  * @return CE_None on success, or CE_Failure on failure.
    5258             :  */
    5259             : 
    5260        3268 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
    5261             :                                               GDALDatasetH hDstDS,
    5262             :                                               CSLConstList papszOptions,
    5263             :                                               GDALProgressFunc pfnProgress,
    5264             :                                               void *pProgressData)
    5265             : 
    5266             : {
    5267        3268 :     VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    5268        3268 :     VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    5269             : 
    5270        3268 :     GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
    5271        3268 :     GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
    5272             : 
    5273        3268 :     if (pfnProgress == nullptr)
    5274           0 :         pfnProgress = GDALDummyProgress;
    5275             : 
    5276             :     /* -------------------------------------------------------------------- */
    5277             :     /*      Confirm the datasets match in size and band counts.             */
    5278             :     /* -------------------------------------------------------------------- */
    5279        3268 :     const int nXSize = poDstDS->GetRasterXSize();
    5280        3268 :     const int nYSize = poDstDS->GetRasterYSize();
    5281        3268 :     const int nBandCount = poDstDS->GetRasterCount();
    5282             : 
    5283        3268 :     if (poSrcDS->GetRasterXSize() != nXSize ||
    5284        6536 :         poSrcDS->GetRasterYSize() != nYSize ||
    5285        3268 :         poSrcDS->GetRasterCount() != nBandCount)
    5286             :     {
    5287           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    5288             :                  "Input and output dataset sizes or band counts do not\n"
    5289             :                  "match in GDALDatasetCopyWholeRaster()");
    5290           0 :         return CE_Failure;
    5291             :     }
    5292             : 
    5293             :     /* -------------------------------------------------------------------- */
    5294             :     /*      Report preliminary (0) progress.                                */
    5295             :     /* -------------------------------------------------------------------- */
    5296        3268 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    5297             :     {
    5298           1 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    5299             :                  "User terminated CreateCopy()");
    5300           1 :         return CE_Failure;
    5301             :     }
    5302             : 
    5303             :     /* -------------------------------------------------------------------- */
    5304             :     /*      Get our prototype band, and assume the others are similarly     */
    5305             :     /*      configured.                                                     */
    5306             :     /* -------------------------------------------------------------------- */
    5307        3267 :     if (nBandCount == 0)
    5308           0 :         return CE_None;
    5309             : 
    5310        3267 :     GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
    5311        3267 :     GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
    5312        3267 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    5313             : 
    5314             :     /* -------------------------------------------------------------------- */
    5315             :     /*      Do we want to try and do the operation in a pixel               */
    5316             :     /*      interleaved fashion?                                            */
    5317             :     /* -------------------------------------------------------------------- */
    5318        3267 :     bool bInterleave = false;
    5319             :     const char *pszInterleave =
    5320        3267 :         poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    5321        3267 :     if (pszInterleave != nullptr &&
    5322        2874 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    5323         189 :         bInterleave = true;
    5324             : 
    5325        3267 :     pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    5326        3267 :     if (pszInterleave != nullptr &&
    5327        2797 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    5328         503 :         bInterleave = true;
    5329             : 
    5330        3267 :     pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
    5331        3267 :     if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
    5332           5 :         bInterleave = true;
    5333        3262 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
    5334          13 :         bInterleave = false;
    5335             :     // attributes is specific to the TileDB driver
    5336        3249 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
    5337           4 :         bInterleave = true;
    5338        3245 :     else if (pszInterleave != nullptr)
    5339             :     {
    5340           0 :         CPLError(CE_Warning, CPLE_NotSupported,
    5341             :                  "Unsupported value for option INTERLEAVE");
    5342             :     }
    5343             : 
    5344             :     // If the destination is compressed, we must try to write blocks just once,
    5345             :     // to save disk space (GTiff case for example), and to avoid data loss
    5346             :     // (JPEG compression for example).
    5347        3267 :     bool bDstIsCompressed = false;
    5348             :     const char *pszDstCompressed =
    5349        3267 :         CSLFetchNameValue(papszOptions, "COMPRESSED");
    5350        3267 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5351         389 :         bDstIsCompressed = true;
    5352             : 
    5353             :     /* -------------------------------------------------------------------- */
    5354             :     /*      What will our swath size be?                                    */
    5355             :     /* -------------------------------------------------------------------- */
    5356             : 
    5357        3267 :     int nSwathCols = 0;
    5358        3267 :     int nSwathLines = 0;
    5359        3267 :     GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
    5360             :                                     nBandCount, bDstIsCompressed, bInterleave,
    5361             :                                     &nSwathCols, &nSwathLines);
    5362             : 
    5363        3267 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5364        3267 :     if (bInterleave)
    5365         556 :         nPixelSize *= nBandCount;
    5366             : 
    5367        3267 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    5368        3267 :     if (pSwathBuf == nullptr)
    5369             :     {
    5370           0 :         return CE_Failure;
    5371             :     }
    5372             : 
    5373        3267 :     CPLDebug("GDAL",
    5374             :              "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
    5375             :              nSwathCols, nSwathLines, static_cast<int>(bInterleave));
    5376             : 
    5377             :     // Advise the source raster that we are going to read it completely
    5378             :     // Note: this might already have been done by GDALCreateCopy() in the
    5379             :     // likely case this function is indirectly called by it
    5380        3267 :     poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
    5381        3267 :                         nullptr, nullptr);
    5382             : 
    5383             :     /* ==================================================================== */
    5384             :     /*      Band oriented (uninterleaved) case.                             */
    5385             :     /* ==================================================================== */
    5386        3267 :     CPLErr eErr = CE_None;
    5387             :     const bool bCheckHoles =
    5388        3267 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    5389             : 
    5390        3267 :     if (!bInterleave)
    5391             :     {
    5392             :         GDALRasterIOExtraArg sExtraArg;
    5393        2711 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5394        2711 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    5395             : 
    5396        8133 :         const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
    5397        2711 :                                      DIV_ROUND_UP(nYSize, nSwathLines) *
    5398        2711 :                                      DIV_ROUND_UP(nXSize, nSwathCols);
    5399        2711 :         GIntBig nBlocksDone = 0;
    5400             : 
    5401        7838 :         for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
    5402             :         {
    5403        5127 :             int nBand = iBand + 1;
    5404             : 
    5405       10512 :             for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5406             :             {
    5407        5385 :                 int nThisLines = nSwathLines;
    5408             : 
    5409        5385 :                 if (iY + nThisLines > nYSize)
    5410         363 :                     nThisLines = nYSize - iY;
    5411             : 
    5412       10770 :                 for (int iX = 0; iX < nXSize && eErr == CE_None;
    5413        5385 :                      iX += nSwathCols)
    5414             :                 {
    5415        5385 :                     int nThisCols = nSwathCols;
    5416             : 
    5417        5385 :                     if (iX + nThisCols > nXSize)
    5418           0 :                         nThisCols = nXSize - iX;
    5419             : 
    5420        5385 :                     int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5421        5385 :                     if (bCheckHoles)
    5422             :                     {
    5423             :                         nStatus = poSrcDS->GetRasterBand(nBand)
    5424        3722 :                                       ->GetDataCoverageStatus(
    5425             :                                           iX, iY, nThisCols, nThisLines,
    5426             :                                           GDAL_DATA_COVERAGE_STATUS_DATA);
    5427             :                     }
    5428        5385 :                     if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5429             :                     {
    5430        5381 :                         sExtraArg.pfnProgress = GDALScaledProgress;
    5431       10762 :                         sExtraArg.pProgressData = GDALCreateScaledProgress(
    5432        5381 :                             nBlocksDone / static_cast<double>(nTotalBlocks),
    5433        5381 :                             (nBlocksDone + 0.5) /
    5434        5381 :                                 static_cast<double>(nTotalBlocks),
    5435             :                             pfnProgress, pProgressData);
    5436        5381 :                         if (sExtraArg.pProgressData == nullptr)
    5437        1633 :                             sExtraArg.pfnProgress = nullptr;
    5438             : 
    5439        5381 :                         eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    5440             :                                                  nThisLines, pSwathBuf,
    5441             :                                                  nThisCols, nThisLines, eDT, 1,
    5442             :                                                  &nBand, 0, 0, 0, &sExtraArg);
    5443             : 
    5444        5381 :                         GDALDestroyScaledProgress(sExtraArg.pProgressData);
    5445             : 
    5446        5381 :                         if (eErr == CE_None)
    5447        5374 :                             eErr = poDstDS->RasterIO(
    5448             :                                 GF_Write, iX, iY, nThisCols, nThisLines,
    5449             :                                 pSwathBuf, nThisCols, nThisLines, eDT, 1,
    5450             :                                 &nBand, 0, 0, 0, nullptr);
    5451             :                     }
    5452             : 
    5453        5385 :                     nBlocksDone++;
    5454       10728 :                     if (eErr == CE_None &&
    5455        5343 :                         !pfnProgress(nBlocksDone /
    5456        5343 :                                          static_cast<double>(nTotalBlocks),
    5457             :                                      nullptr, pProgressData))
    5458             :                     {
    5459           2 :                         eErr = CE_Failure;
    5460           2 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    5461             :                                  "User terminated CreateCopy()");
    5462             :                     }
    5463             :                 }
    5464             :             }
    5465             :         }
    5466             :     }
    5467             : 
    5468             :     /* ==================================================================== */
    5469             :     /*      Pixel interleaved case.                                         */
    5470             :     /* ==================================================================== */
    5471             :     else /* if( bInterleave ) */
    5472             :     {
    5473             :         GDALRasterIOExtraArg sExtraArg;
    5474         556 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5475         556 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    5476             : 
    5477         556 :         const GIntBig nTotalBlocks =
    5478         556 :             static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
    5479         556 :             DIV_ROUND_UP(nXSize, nSwathCols);
    5480         556 :         GIntBig nBlocksDone = 0;
    5481             : 
    5482        1332 :         for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5483             :         {
    5484         776 :             int nThisLines = nSwathLines;
    5485             : 
    5486         776 :             if (iY + nThisLines > nYSize)
    5487         195 :                 nThisLines = nYSize - iY;
    5488             : 
    5489        1558 :             for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    5490             :             {
    5491         782 :                 int nThisCols = nSwathCols;
    5492             : 
    5493         782 :                 if (iX + nThisCols > nXSize)
    5494           4 :                     nThisCols = nXSize - iX;
    5495             : 
    5496         782 :                 int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5497         782 :                 if (bCheckHoles)
    5498             :                 {
    5499         549 :                     nStatus = 0;
    5500         602 :                     for (int iBand = 0; iBand < nBandCount; iBand++)
    5501             :                     {
    5502         583 :                         nStatus |= poSrcDS->GetRasterBand(iBand + 1)
    5503         583 :                                        ->GetDataCoverageStatus(
    5504             :                                            iX, iY, nThisCols, nThisLines,
    5505             :                                            GDAL_DATA_COVERAGE_STATUS_DATA);
    5506         583 :                         if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5507         530 :                             break;
    5508             :                     }
    5509             :                 }
    5510         782 :                 if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5511             :                 {
    5512         763 :                     sExtraArg.pfnProgress = GDALScaledProgress;
    5513        1526 :                     sExtraArg.pProgressData = GDALCreateScaledProgress(
    5514         763 :                         nBlocksDone / static_cast<double>(nTotalBlocks),
    5515         763 :                         (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
    5516             :                         pfnProgress, pProgressData);
    5517         763 :                     if (sExtraArg.pProgressData == nullptr)
    5518         348 :                         sExtraArg.pfnProgress = nullptr;
    5519             : 
    5520         763 :                     eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    5521             :                                              nThisLines, pSwathBuf, nThisCols,
    5522             :                                              nThisLines, eDT, nBandCount,
    5523             :                                              nullptr, 0, 0, 0, &sExtraArg);
    5524             : 
    5525         763 :                     GDALDestroyScaledProgress(sExtraArg.pProgressData);
    5526             : 
    5527         763 :                     if (eErr == CE_None)
    5528         761 :                         eErr = poDstDS->RasterIO(
    5529             :                             GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
    5530             :                             nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
    5531             :                             0, 0, nullptr);
    5532             :                 }
    5533             : 
    5534         782 :                 nBlocksDone++;
    5535        1559 :                 if (eErr == CE_None &&
    5536         777 :                     !pfnProgress(nBlocksDone /
    5537         777 :                                      static_cast<double>(nTotalBlocks),
    5538             :                                  nullptr, pProgressData))
    5539             :                 {
    5540           0 :                     eErr = CE_Failure;
    5541           0 :                     CPLError(CE_Failure, CPLE_UserInterrupt,
    5542             :                              "User terminated CreateCopy()");
    5543             :                 }
    5544             :             }
    5545             :         }
    5546             :     }
    5547             : 
    5548             :     /* -------------------------------------------------------------------- */
    5549             :     /*      Cleanup                                                         */
    5550             :     /* -------------------------------------------------------------------- */
    5551        3267 :     CPLFree(pSwathBuf);
    5552             : 
    5553        3267 :     return eErr;
    5554             : }
    5555             : 
    5556             : /************************************************************************/
    5557             : /*                     GDALRasterBandCopyWholeRaster()                  */
    5558             : /************************************************************************/
    5559             : 
    5560             : /**
    5561             :  * \brief Copy a whole raster band
    5562             :  *
    5563             :  * This function copies the complete raster contents of one band to
    5564             :  * another similarly configured band.  The source and destination
    5565             :  * bands must have the same width and height.  The bands do not have
    5566             :  * to have the same data type; the swath buffer uses the destination's type.
    5567             :  *
    5568             :  * It implements efficient copying, in particular "chunking" the copy in
    5569             :  * substantial blocks.
    5570             :  *
    5571             :  * Currently the only papszOptions values supported are:
    5572             :  * <ul>
    5573             :  * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
    5574             :  * achieve best compression.</li>
    5575             :  * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
    5576             :  * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL &gt;= 2.2)</li>
    5577             :  * </ul>
    5578             :  *
    5579             :  * @param hSrcBand the source band
    5580             :  * @param hDstBand the destination band
    5581             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    5582             :  * @param pfnProgress progress reporting function; nullptr selects GDALDummyProgress.
    5583             :  * @param pProgressData callback data for progress function.
    5584             :  *
    5585             :  * @return CE_None on success, or CE_Failure on failure.
    5586             :  */
    5587             : 
    5588          29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
    5589             :     GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
    5590             :     const char *const *const papszOptions, GDALProgressFunc pfnProgress,
    5591             :     void *pProgressData)
    5592             : 
    5593             : {
    5594          29 :     VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    5595          29 :     VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    5596             : 
    5597          29 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    5598          29 :     GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
    5599          29 :     CPLErr eErr = CE_None;
    5600             : 
    5601          29 :     if (pfnProgress == nullptr)
    5602           2 :         pfnProgress = GDALDummyProgress;
    5603             : 
    5604             :     /* -------------------------------------------------------------------- */
    5605             :     /*      Confirm the source and destination bands match in size.         */
    5606             :     /* -------------------------------------------------------------------- */
    5607          29 :     int nXSize = poSrcBand->GetXSize();
    5608          29 :     int nYSize = poSrcBand->GetYSize();
    5609             : 
    5610          29 :     if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
    5611             :     {
    5612           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    5613             :                  "Input and output band sizes do not\n"
    5614             :                  "match in GDALRasterBandCopyWholeRaster()");
    5615           0 :         return CE_Failure;
    5616             :     }
    5617             : 
    5618             :     /* -------------------------------------------------------------------- */
    5619             :     /*      Report preliminary (0) progress.                                */
    5620             :     /* -------------------------------------------------------------------- */
    5621          29 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    5622             :     {
    5623           0 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    5624             :                  "User terminated CreateCopy()");
    5625           0 :         return CE_Failure;
    5626             :     }
    5627             :     // The swath buffer and both RasterIO calls below use the destination band's data type.
    5628          29 :     GDALDataType eDT = poDstBand->GetRasterDataType();
    5629             : 
    5630             :     // If the destination is compressed, we must try to write blocks just once,
    5631             :     // to save disk space (GTiff case for example), and to avoid data loss
    5632             :     // (JPEG compression for example).
    5633          29 :     bool bDstIsCompressed = false;
    5634             :     const char *pszDstCompressed =
    5635          29 :         CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
    5636          29 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5637          26 :         bDstIsCompressed = true;
    5638             : 
    5639             :     /* -------------------------------------------------------------------- */
    5640             :     /*      What will our swath size be?                                    */
    5641             :     /* -------------------------------------------------------------------- */
    5642             : 
    5643          29 :     int nSwathCols = 0;
    5644          29 :     int nSwathLines = 0;
    5645          29 :     GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
    5646             :                                     FALSE, &nSwathCols, &nSwathLines);
    5647             : 
    5648          29 :     const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5649             : 
    5650          29 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    5651          29 :     if (pSwathBuf == nullptr)
    5652             :     {
    5653           0 :         return CE_Failure;
    5654             :     }
    5655             : 
    5656          29 :     CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
    5657             :              nSwathCols, nSwathLines);
    5658             : 
    5659             :     const bool bCheckHoles =
    5660          29 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    5661             : 
    5662             :     // Advise the source raster that we are going to read it completely
    5663          29 :     poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
    5664             : 
    5665             :     /* ==================================================================== */
    5666             :     /*      Copy the band one swath at a time.                              */
    5667             :     /* ==================================================================== */
    5668             : 
    5669          72 :     for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5670             :     {
    5671          43 :         int nThisLines = nSwathLines;
    5672             : 
    5673          43 :         if (iY + nThisLines > nYSize)
    5674           8 :             nThisLines = nYSize - iY;
    5675             : 
    5676          86 :         for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    5677             :         {
    5678          43 :             int nThisCols = nSwathCols;
    5679             : 
    5680          43 :             if (iX + nThisCols > nXSize)
    5681           0 :                 nThisCols = nXSize - iX;
    5682             :             // Treat the swath as containing data unless SKIP_HOLES makes us ask the source band.
    5683          43 :             int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5684          43 :             if (bCheckHoles)
    5685             :             {
    5686           0 :                 nStatus = poSrcBand->GetDataCoverageStatus(
    5687             :                     iX, iY, nThisCols, nThisLines,
    5688             :                     GDAL_DATA_COVERAGE_STATUS_DATA);
    5689             :             }
    5690          43 :             if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5691             :             {
    5692          43 :                 eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
    5693             :                                            nThisLines, pSwathBuf, nThisCols,
    5694             :                                            nThisLines, eDT, 0, 0, nullptr);
    5695             : 
    5696          43 :                 if (eErr == CE_None)
    5697          43 :                     eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
    5698             :                                                nThisLines, pSwathBuf, nThisCols,
    5699             :                                                nThisLines, eDT, 0, 0, nullptr);
    5700             :             }
    5701             :             // Progress is derived from completed lines only: (iY + nThisLines) / nYSize.
    5702          86 :             if (eErr == CE_None && !pfnProgress(double(iY + nThisLines) /
    5703          43 :                                                     static_cast<double>(nYSize),
    5704             :                                                 nullptr, pProgressData))
    5705             :             {
    5706           0 :                 eErr = CE_Failure;
    5707           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt,
    5708             :                          "User terminated CreateCopy()");
    5709             :             }
    5710             :         }
    5711             :     }
    5712             : 
    5713             :     /* -------------------------------------------------------------------- */
    5714             :     /*      Cleanup                                                         */
    5715             :     /* -------------------------------------------------------------------- */
    5716          29 :     CPLFree(pSwathBuf);
    5717             : 
    5718          29 :     return eErr;
    5719             : }
    5720             : 
    5721             : /************************************************************************/
    5722             : /*                      GDALCopyRasterIOExtraArg ()                     */
    5723             : /************************************************************************/
    5724             : // Initialise *psDestArg with INIT_RASTERIO_EXTRA_ARG, then copy selected fields from *psSrcArg if it is non-null.
    5725      527273 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
    5726             :                               GDALRasterIOExtraArg *psSrcArg)
    5727             : {
    5728      527273 :     INIT_RASTERIO_EXTRA_ARG(*psDestArg);
    5729      527273 :     if (psSrcArg)
    5730             :     {
    5731      527273 :         psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
    5732      527273 :         psDestArg->pfnProgress = psSrcArg->pfnProgress;
    5733      527273 :         psDestArg->pProgressData = psSrcArg->pProgressData;
    5734      527273 :         psDestArg->bFloatingPointWindowValidity =
    5735      527273 :             psSrcArg->bFloatingPointWindowValidity;
    5736      527273 :         if (psSrcArg->bFloatingPointWindowValidity)  // copy the floating-point window only when flagged valid
    5737             :         {
    5738      204391 :             psDestArg->dfXOff = psSrcArg->dfXOff;
    5739      204391 :             psDestArg->dfYOff = psSrcArg->dfYOff;
    5740      204391 :             psDestArg->dfXSize = psSrcArg->dfXSize;
    5741      204391 :             psDestArg->dfYSize = psSrcArg->dfYSize;
    5742             :         }
    5743      527273 :         if (psSrcArg->nVersion >= 2)  // presumably bUseOnlyThisScale does not exist in version 1 structs — confirm
    5744             :         {
    5745      527273 :             psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
    5746             :         }
    5747             :     }
    5748      527273 : }
    5749             : 
    5750             : /************************************************************************/
    5751             : /*                         HasOnlyNoData()                              */
    5752             : /************************************************************************/
    5753             : // Generic case: a sample matches nodata when the two values compare equal with ==.
    5754    25110502 : template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
    5755             : {
    5756    25110502 :     return value == noDataValue;
    5757             : }
    5758             : // GFloat16: a NaN nodata value matches any NaN sample; otherwise compare with ==.
    5759        5509 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
    5760             : {
    5761             :     using std::isnan;  // unqualified isnan below; presumably lets a GFloat16 overload be selected — confirm
    5762        5509 :     return isnan(noDataValue) ? isnan(value) : value == noDataValue;
    5763             : }
    5764             : // float: a NaN nodata value matches any NaN sample; otherwise compare with ==.
    5765      250690 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
    5766             : {
    5767      250690 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5768             : }
    5769             : // double: a NaN nodata value matches any NaN sample; otherwise compare with ==.
    5770      263861 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
    5771             : {
    5772      263861 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5773             : }
    5774             : // True if all samples of the nWidth x nHeight window equal noDataValue; nLineStride is in pixels of nComponents samples.
    5775             : template <class T>
    5776       11660 : static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
    5777             :                            size_t nHeight, size_t nLineStride,
    5778             :                            size_t nComponents)
    5779             : {
    5780             :     // Fast test: for each component, check the 4 corners and the middle pixel.
    5781       22790 :     for (size_t iBand = 0; iBand < nComponents; iBand++)
    5782             :     {
    5783       23528 :         if (!(IsEqualToNoData(pBuffer[iBand], noDataValue) &&
    5784       11679 :               IsEqualToNoData(pBuffer[(nWidth - 1) * nComponents + iBand],
    5785       11599 :                               noDataValue) &&
    5786       11599 :               IsEqualToNoData(
    5787       11599 :                   pBuffer[((nHeight - 1) / 2 * nLineStride + (nWidth - 1) / 2) *
    5788       11599 :                               nComponents +
    5789             :                           iBand],
    5790       11133 :                   noDataValue) &&
    5791       11133 :               IsEqualToNoData(
    5792       11133 :                   pBuffer[(nHeight - 1) * nLineStride * nComponents + iBand],
    5793             :                   noDataValue) &&
    5794       11133 :               IsEqualToNoData(
    5795       11133 :                   pBuffer[((nHeight - 1) * nLineStride + nWidth - 1) *
    5796       11133 :                               nComponents +
    5797             :                           iBand],
    5798             :                   noDataValue)))
    5799             :         {
    5800         719 :             return false;
    5801             :         }
    5802             :     }
    5803             :     // NOTE(review): the checks above index with nWidth - 1 and nHeight - 1, so they assume a non-empty window — confirm.
    5804             :     // Exhaustive test: scan every sample of every line.
    5805       37304 :     for (size_t iY = 0; iY < nHeight; iY++)
    5806             :     {
    5807       26466 :         const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
    5808    25599518 :         for (size_t iX = 0; iX < nWidth * nComponents; iX++)
    5809             :         {
    5810    25573175 :             if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
    5811             :             {
    5812         103 :                 return false;
    5813             :             }
    5814             :         }
    5815             :     }
    5816       10838 :     return true;
    5817             : }
    5818             : 
    5819             : /************************************************************************/
    5820             : /*                    GDALBufferHasOnlyNoData()                         */
    5821             : /************************************************************************/
    5822             : 
    5823       43435 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
    5824             :                              size_t nWidth, size_t nHeight, size_t nLineStride,
    5825             :                              size_t nComponents, int nBitsPerSample,
    5826             :                              GDALBufferSampleFormat nSampleFormat)
    5827             : {
    5828             :     // In the case where the nodata is 0, we can compare several bytes at
    5829             :     // once. Select the largest natural integer type for the architecture.
    5830       43435 :     if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    5831             :         // Do not use this optimized code path for floating point numbers,
    5832             :         // as it can't detect negative zero.
    5833             :         nSampleFormat != GSF_FLOATING_POINT)
    5834             :     {
    5835       27219 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    5836       27219 :         const size_t nSize =
    5837       27219 :             static_cast<size_t>((static_cast<uint64_t>(nWidth) * nHeight *
    5838       27219 :                                      nComponents * nBitsPerSample +
    5839             :                                  7) /
    5840             :                                 8);
    5841             : #ifdef HAVE_SSE2
    5842       27219 :         size_t n = nSize;
    5843             :         // Align to 16 bytes
    5844       27282 :         while ((reinterpret_cast<uintptr_t>(pabyBuffer) & 15) != 0 && n > 0)
    5845             :         {
    5846          73 :             --n;
    5847          73 :             if (*pabyBuffer)
    5848          10 :                 return false;
    5849          63 :             pabyBuffer++;
    5850             :         }
    5851             : 
    5852       27209 :         const auto zero = _mm_setzero_si128();
    5853       27209 :         constexpr int UNROLLING = 4;
    5854     2085580 :         while (n >= UNROLLING * sizeof(zero))
    5855             :         {
    5856     2070360 :             const auto v0 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    5857             :                 pabyBuffer + 0 * sizeof(zero)));
    5858     2070360 :             const auto v1 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    5859     2070360 :                 pabyBuffer + 1 * sizeof(zero)));
    5860     2070360 :             const auto v2 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    5861     2070360 :                 pabyBuffer + 2 * sizeof(zero)));
    5862     2070360 :             const auto v3 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    5863     2070360 :                 pabyBuffer + 3 * sizeof(zero)));
    5864             :             const auto v =
    5865     6211070 :                 _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    5866             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    5867             :             if (!_mm_test_all_zeros(v, v))
    5868             : #else
    5869     4140720 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    5870             : #endif
    5871             :             {
    5872       11982 :                 return false;
    5873             :             }
    5874     2058380 :             pabyBuffer += UNROLLING * sizeof(zero);
    5875     2058380 :             n -= UNROLLING * sizeof(zero);
    5876             :         }
    5877             : 
    5878      233552 :         while (n > 0)
    5879             :         {
    5880      218425 :             --n;
    5881      218425 :             if (*pabyBuffer)
    5882         100 :                 return false;
    5883      218325 :             pabyBuffer++;
    5884             :         }
    5885             : #else
    5886             : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
    5887             :         // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
    5888             :         typedef std::uint64_t WordType;
    5889             : #else
    5890             :         typedef std::uint32_t WordType;
    5891             : #endif
    5892             : 
    5893             :         const size_t nInitialIters =
    5894             :             std::min(sizeof(WordType) -
    5895             :                          static_cast<size_t>(
    5896             :                              reinterpret_cast<std::uintptr_t>(pabyBuffer) %
    5897             :                              sizeof(WordType)),
    5898             :                      nSize);
    5899             :         size_t i = 0;
    5900             :         for (; i < nInitialIters; i++)
    5901             :         {
    5902             :             if (pabyBuffer[i])
    5903             :                 return false;
    5904             :         }
    5905             :         for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
    5906             :         {
    5907             :             if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
    5908             :                 return false;
    5909             :         }
    5910             :         for (; i < nSize; i++)
    5911             :         {
    5912             :             if (pabyBuffer[i])
    5913             :                 return false;
    5914             :         }
    5915             : #endif
    5916       15127 :         return true;
    5917             :     }
    5918             : 
    5919             : #ifdef HAVE_SSE2
    5920       16216 :     else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    5921         708 :              nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    5922             :     {
    5923         708 :         const auto signMask = _mm_set1_epi32(0x7FFFFFFF);
    5924         708 :         const auto zero = _mm_setzero_si128();
    5925         708 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    5926         708 :         const size_t n = nWidth * nHeight * nComponents;
    5927             : 
    5928         708 :         size_t i = 0;
    5929         708 :         constexpr int UNROLLING = 4;
    5930         708 :         constexpr size_t VALUES_PER_ITER =
    5931             :             UNROLLING * sizeof(zero) / sizeof(float);
    5932       24983 :         for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
    5933             :         {
    5934       24934 :             const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    5935             :                 pabyBuffer + 0 * sizeof(zero)));
    5936       24934 :             const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    5937       24934 :                 pabyBuffer + 1 * sizeof(zero)));
    5938       24934 :             const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    5939       24934 :                 pabyBuffer + 2 * sizeof(zero)));
    5940       24934 :             const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    5941       24934 :                 pabyBuffer + 3 * sizeof(zero)));
    5942       74802 :             auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    5943             :             // Clear the sign bit (makes -0.0 become +0.0)
    5944       24934 :             v = _mm_and_si128(v, signMask);
    5945             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    5946             :             if (!_mm_test_all_zeros(v, v))
    5947             : #else
    5948       49868 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    5949             : #endif
    5950             :             {
    5951         659 :                 return false;
    5952             :             }
    5953       24275 :             pabyBuffer += UNROLLING * sizeof(zero);
    5954             :         }
    5955             : 
    5956         304 :         for (; i < n; i++)
    5957             :         {
    5958             :             uint32_t bits;
    5959         272 :             memcpy(&bits, pabyBuffer, sizeof(bits));
    5960         272 :             pabyBuffer += sizeof(bits);
    5961         272 :             if ((bits & 0x7FFFFFFF) != 0)
    5962          17 :                 return false;
    5963             :         }
    5964             : 
    5965          32 :         return true;
    5966             :     }
    5967             : 
    5968       15508 :     else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    5969        3841 :              nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    5970             :     {
    5971        3841 :         const auto signMask = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
    5972        3841 :         const auto zero = _mm_setzero_si128();
    5973        3841 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    5974        3841 :         const size_t n = nWidth * nHeight * nComponents;
    5975             : 
    5976        3841 :         size_t i = 0;
    5977        3841 :         constexpr int UNROLLING = 4;
    5978        3841 :         constexpr size_t VALUES_PER_ITER =
    5979             :             UNROLLING * sizeof(zero) / sizeof(double);
    5980     1664320 :         for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
    5981             :         {
    5982     1660710 :             const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    5983             :                 pabyBuffer + 0 * sizeof(zero)));
    5984     1660710 :             const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    5985     1660710 :                 pabyBuffer + 1 * sizeof(zero)));
    5986     1660710 :             const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    5987     1660710 :                 pabyBuffer + 2 * sizeof(zero)));
    5988     1660710 :             const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    5989     1660710 :                 pabyBuffer + 3 * sizeof(zero)));
    5990     4982130 :             auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    5991             :             // Clear the sign bit (makes -0.0 become +0.0)
    5992     1660710 :             v = _mm_and_si128(v, signMask);
    5993             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    5994             :             if (!_mm_test_all_zeros(v, v))
    5995             : #else
    5996     3321420 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    5997             : #endif
    5998             :             {
    5999         227 :                 return false;
    6000             :             }
    6001     1660480 :             pabyBuffer += UNROLLING * sizeof(zero);
    6002             :         }
    6003             : 
    6004        3633 :         for (; i < n; i++)
    6005             :         {
    6006             :             uint64_t bits;
    6007          26 :             memcpy(&bits, pabyBuffer, sizeof(bits));
    6008          26 :             pabyBuffer += sizeof(bits);
    6009          26 :             if ((bits & 0x7FFFFFFFFFFFFFFFULL) != 0)
    6010           7 :                 return false;
    6011             :         }
    6012             : 
    6013        3607 :         return true;
    6014             :     }
    6015             : #endif
    6016             : 
    6017       11667 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
    6018             :     {
    6019       22274 :         return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
    6020       11137 :                HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
    6021       11137 :                               static_cast<uint8_t>(dfNoDataValue), nWidth,
    6022       11137 :                               nHeight, nLineStride, nComponents);
    6023             :     }
    6024         530 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
    6025             :     {
    6026             :         // Use unsigned implementation by converting the nodatavalue to
    6027             :         // unsigned
    6028          63 :         return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
    6029          31 :                HasOnlyNoDataT(
    6030             :                    static_cast<const uint8_t *>(pBuffer),
    6031          31 :                    static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
    6032          32 :                    nWidth, nHeight, nLineStride, nComponents);
    6033             :     }
    6034         498 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
    6035             :     {
    6036          23 :         return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
    6037          11 :                HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
    6038          11 :                               static_cast<uint16_t>(dfNoDataValue), nWidth,
    6039          12 :                               nHeight, nLineStride, nComponents);
    6040             :     }
    6041         486 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
    6042             :     {
    6043             :         // Use unsigned implementation by converting the nodatavalue to
    6044             :         // unsigned
    6045          99 :         return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
    6046          49 :                HasOnlyNoDataT(
    6047             :                    static_cast<const uint16_t *>(pBuffer),
    6048          49 :                    static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
    6049          50 :                    nWidth, nHeight, nLineStride, nComponents);
    6050             :     }
    6051         436 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
    6052             :     {
    6053          73 :         return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
    6054          36 :                HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
    6055             :                               static_cast<uint32_t>(dfNoDataValue), nWidth,
    6056          37 :                               nHeight, nLineStride, nComponents);
    6057             :     }
    6058         399 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
    6059             :     {
    6060             :         // Use unsigned implementation by converting the nodatavalue to
    6061             :         // unsigned
    6062          23 :         return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
    6063          11 :                HasOnlyNoDataT(
    6064             :                    static_cast<const uint32_t *>(pBuffer),
    6065          11 :                    static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
    6066          12 :                    nWidth, nHeight, nLineStride, nComponents);
    6067             :     }
    6068         387 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
    6069             :     {
    6070          56 :         return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
    6071          28 :                HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
    6072             :                               static_cast<uint64_t>(dfNoDataValue), nWidth,
    6073          28 :                               nHeight, nLineStride, nComponents);
    6074             :     }
    6075         359 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
    6076             :     {
    6077             :         // Use unsigned implementation by converting the nodatavalue to
    6078             :         // unsigned
    6079           0 :         return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
    6080           0 :                HasOnlyNoDataT(
    6081             :                    static_cast<const uint64_t *>(pBuffer),
    6082           0 :                    static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
    6083           0 :                    nWidth, nHeight, nLineStride, nComponents);
    6084             :     }
    6085         359 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
    6086             :     {
    6087         106 :         return (std::isnan(dfNoDataValue) ||
    6088         211 :                 GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
    6089         105 :                HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
    6090             :                               static_cast<GFloat16>(dfNoDataValue), nWidth,
    6091         106 :                               nHeight, nLineStride, nComponents);
    6092             :     }
    6093         253 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    6094             :     {
    6095         153 :         return (std::isnan(dfNoDataValue) ||
    6096         305 :                 GDALIsValueInRange<float>(dfNoDataValue)) &&
    6097         152 :                HasOnlyNoDataT(static_cast<const float *>(pBuffer),
    6098             :                               static_cast<float>(dfNoDataValue), nWidth,
    6099         153 :                               nHeight, nLineStride, nComponents);
    6100             :     }
    6101         100 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    6102             :     {
    6103         100 :         return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
    6104             :                               dfNoDataValue, nWidth, nHeight, nLineStride,
    6105         100 :                               nComponents);
    6106             :     }
    6107           0 :     return false;
    6108             : }
    6109             : 
    6110             : #ifdef HAVE_SSE2
    6111             : 
    6112             : /************************************************************************/
    6113             : /*                    GDALDeinterleave3Byte()                           */
    6114             : /************************************************************************/
    6115             : 
    6116             : #if defined(__GNUC__) && !defined(__clang__)
    6117             : __attribute__((optimize("no-tree-vectorize")))
    6118             : #endif
    6119             : static void
    6120      361353 : GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    6121             :                       GByte *CPL_RESTRICT pabyDest0,
    6122             :                       GByte *CPL_RESTRICT pabyDest1,
    6123             :                       GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    6124             : #ifdef USE_NEON_OPTIMIZATIONS
    6125             : {
    6126             :     return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6127             :                                        nIters);
    6128             : }
    6129             : #else
    6130             : {
    6131             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    6132      361353 :     if (CPLHaveRuntimeSSSE3())
    6133             :     {
    6134      361351 :         return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    6135      361351 :                                            pabyDest2, nIters);
    6136             :     }
    6137             : #endif
    6138             : 
    6139           2 :     size_t i = 0;
    6140           2 :     if (((reinterpret_cast<uintptr_t>(pabySrc) |
    6141           2 :           reinterpret_cast<uintptr_t>(pabyDest0) |
    6142           2 :           reinterpret_cast<uintptr_t>(pabyDest1) |
    6143           2 :           reinterpret_cast<uintptr_t>(pabyDest2)) %
    6144             :          sizeof(unsigned int)) == 0)
    6145             :     {
    6146             :         // Slightly better than GCC autovectorizer
    6147          17 :         for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
    6148             :         {
    6149          15 :             unsigned int word0 =
    6150          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
    6151          15 :             unsigned int word1 =
    6152          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
    6153          15 :             unsigned int word2 =
    6154          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
    6155          15 :             reinterpret_cast<unsigned int *>(pabyDest0)[j] =
    6156          15 :                 (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
    6157          15 :                 ((word2 >> 8) << 24);
    6158          15 :             reinterpret_cast<unsigned int *>(pabyDest1)[j] =
    6159          15 :                 ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
    6160          15 :                 (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
    6161          15 :             pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
    6162          15 :             pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
    6163          15 :             pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
    6164          15 :             pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
    6165             :         }
    6166             :     }
    6167             : #if defined(__clang__)
    6168             : #pragma clang loop vectorize(disable)
    6169             : #endif
    6170           3 :     for (; i < nIters; ++i)
    6171             :     {
    6172           1 :         pabyDest0[i] = pabySrc[3 * i + 0];
    6173           1 :         pabyDest1[i] = pabySrc[3 * i + 1];
    6174           1 :         pabyDest2[i] = pabySrc[3 * i + 2];
    6175             :     }
    6176             : }
    6177             : #endif
    6178             : 
    6179             : /************************************************************************/
    6180             : /*                    GDALDeinterleave4Byte()                           */
    6181             : /************************************************************************/
    6182             : 
    6183             : #if !defined(__GNUC__) || defined(__clang__)
    6184             : 
    6185             : /************************************************************************/
    6186             : /*                         deinterleave()                               */
    6187             : /************************************************************************/
    6188             : 
    6189             : template <bool SHIFT, bool MASK>
    6190             : inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
    6191             :                             __m128i &xmm2_ori, __m128i &xmm3_ori)
    6192             : {
    6193             :     // If SHIFT, move the next byte of each packed int32 word into its low 8 bits (MASK below then zeroes the higher 24 bits)
    6194             :     if (SHIFT)
    6195             :     {
    6196             :         xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
    6197             :         xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
    6198             :         xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
    6199             :         xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
    6200             :     }
    6201             :     __m128i xmm0;
    6202             :     __m128i xmm1;
    6203             :     __m128i xmm2;
    6204             :     __m128i xmm3;
    6205             :     if (MASK)
    6206             :     {
    6207             :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    6208             :         xmm0 = _mm_and_si128(xmm0_ori, xmm_mask);
    6209             :         xmm1 = _mm_and_si128(xmm1_ori, xmm_mask);
    6210             :         xmm2 = _mm_and_si128(xmm2_ori, xmm_mask);
    6211             :         xmm3 = _mm_and_si128(xmm3_ori, xmm_mask);
    6212             :     }
    6213             :     else
    6214             :     {
    6215             :         xmm0 = xmm0_ori;
    6216             :         xmm1 = xmm1_ori;
    6217             :         xmm2 = xmm2_ori;
    6218             :         xmm3 = xmm3_ori;
    6219             :     }
    6220             :     // Pack int32 to int16
    6221             :     xmm0 = _mm_packs_epi32(xmm0, xmm1);
    6222             :     xmm2 = _mm_packs_epi32(xmm2, xmm3);
    6223             :     // Pack int16 to uint8
    6224             :     xmm0 = _mm_packus_epi16(xmm0, xmm2);
    6225             :     return xmm0;
    6226             : }
    6227             : 
    6228             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    6229             :                                   GByte *CPL_RESTRICT pabyDest0,
    6230             :                                   GByte *CPL_RESTRICT pabyDest1,
    6231             :                                   GByte *CPL_RESTRICT pabyDest2,
    6232             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6233             : #ifdef USE_NEON_OPTIMIZATIONS
    6234             : {
    6235             :     return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6236             :                                        pabyDest3, nIters);
    6237             : }
    6238             : #else
    6239             : {
    6240             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    6241             :     if (CPLHaveRuntimeSSSE3())
    6242             :     {
    6243             :         return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    6244             :                                            pabyDest2, pabyDest3, nIters);
    6245             :     }
    6246             : #endif
    6247             : 
    6248             :     // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
    6249             :     // do something slightly better.
    6250             :     size_t i = 0;
    6251             :     for (; i + 15 < nIters; i += 16)
    6252             :     {
    6253             :         __m128i xmm0_ori = _mm_loadu_si128(
    6254             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
    6255             :         __m128i xmm1_ori = _mm_loadu_si128(
    6256             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
    6257             :         __m128i xmm2_ori = _mm_loadu_si128(
    6258             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
    6259             :         __m128i xmm3_ori = _mm_loadu_si128(
    6260             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
    6261             : 
    6262             :         _mm_storeu_si128(
    6263             :             reinterpret_cast<__m128i *>(pabyDest0 + i),
    6264             :             deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6265             :         _mm_storeu_si128(
    6266             :             reinterpret_cast<__m128i *>(pabyDest1 + i),
    6267             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6268             :         _mm_storeu_si128(
    6269             :             reinterpret_cast<__m128i *>(pabyDest2 + i),
    6270             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6271             :         _mm_storeu_si128(
    6272             :             reinterpret_cast<__m128i *>(pabyDest3 + i),
    6273             :             deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6274             :     }
    6275             : 
    6276             : #if defined(__clang__)
    6277             : #pragma clang loop vectorize(disable)
    6278             : #endif
    6279             :     for (; i < nIters; ++i)
    6280             :     {
    6281             :         pabyDest0[i] = pabySrc[4 * i + 0];
    6282             :         pabyDest1[i] = pabySrc[4 * i + 1];
    6283             :         pabyDest2[i] = pabySrc[4 * i + 2];
    6284             :         pabyDest3[i] = pabySrc[4 * i + 3];
    6285             :     }
    6286             : }
    6287             : #endif
    6288             : #else
    6289             : // GCC autovectorizer does an excellent job
    6290       62363 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
    6291             :     const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    6292             :     GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
    6293             :     GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6294             : {
    6295   537151000 :     for (size_t i = 0; i < nIters; ++i)
    6296             :     {
    6297   537089000 :         pabyDest0[i] = pabySrc[4 * i + 0];
    6298   537089000 :         pabyDest1[i] = pabySrc[4 * i + 1];
    6299   537089000 :         pabyDest2[i] = pabySrc[4 * i + 2];
    6300   537089000 :         pabyDest3[i] = pabySrc[4 * i + 3];
    6301             :     }
    6302       62363 : }
    6303             : #endif
    6304             : 
    6305             : #else
    6306             : 
    6307             : /************************************************************************/
    6308             : /*                    GDALDeinterleave3Byte()                           */
    6309             : /************************************************************************/
    6310             : 
    6311             : // TODO: Enabling below could help on non-Intel architectures where GCC knows
    6312             : // how to auto-vectorize
    6313             : // #if defined(__GNUC__)
    6314             : //__attribute__((optimize("tree-vectorize")))
    6315             : // #endif
    6316             : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    6317             :                                   GByte *CPL_RESTRICT pabyDest0,
    6318             :                                   GByte *CPL_RESTRICT pabyDest1,
    6319             :                                   GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    6320             : {
    6321             :     for (size_t i = 0; i < nIters; ++i)
    6322             :     {
    6323             :         pabyDest0[i] = pabySrc[3 * i + 0];
    6324             :         pabyDest1[i] = pabySrc[3 * i + 1];
    6325             :         pabyDest2[i] = pabySrc[3 * i + 2];
    6326             :     }
    6327             : }
    6328             : 
    6329             : /************************************************************************/
    6330             : /*                    GDALDeinterleave4Byte()                           */
    6331             : /************************************************************************/
    6332             : 
    6333             : // TODO: Enabling below could help on non-Intel architectures where gcc knows
    6334             : // how to auto-vectorize
    6335             : // #if defined(__GNUC__)
    6336             : //__attribute__((optimize("tree-vectorize")))
    6337             : // #endif
    6338             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    6339             :                                   GByte *CPL_RESTRICT pabyDest0,
    6340             :                                   GByte *CPL_RESTRICT pabyDest1,
    6341             :                                   GByte *CPL_RESTRICT pabyDest2,
    6342             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6343             : {
    6344             :     for (size_t i = 0; i < nIters; ++i)
    6345             :     {
    6346             :         pabyDest0[i] = pabySrc[4 * i + 0];
    6347             :         pabyDest1[i] = pabySrc[4 * i + 1];
    6348             :         pabyDest2[i] = pabySrc[4 * i + 2];
    6349             :         pabyDest3[i] = pabySrc[4 * i + 3];
    6350             :     }
    6351             : }
    6352             : 
    6353             : #endif
    6354             : 
    6355             : /************************************************************************/
    6356             : /*                      GDALDeinterleave()                              */
    6357             : /************************************************************************/
    6358             : 
    6359             : /*! Copy values from a pixel-interleaved buffer to multiple per-component
    6360             :     buffers.
    6361             : 
    6362             :     In pseudo-code
    6363             :     \verbatim
    6364             :     for(size_t i = 0; i < nIters; ++i)
    6365             :         for(int iComp = 0; iComp < nComponents; iComp++ )
    6366             :             ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
    6367             :     \endverbatim
    6368             : 
    6369             :     The implementation is optimized for a few cases, like de-interleaving
    6370             :     of 3- or 4-component Byte buffers.
    6371             : 
    6372             :     \since GDAL 3.6
    6373             :  */
    6374      424066 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
    6375             :                       int nComponents, void **ppDestBuffer,
    6376             :                       GDALDataType eDestDT, size_t nIters)
    6377             : {
    6378      424066 :     if (eSourceDT == eDestDT)
    6379             :     {
    6380      424044 :         if (eSourceDT == GDT_UInt8 || eSourceDT == GDT_Int8)
    6381             :         {
    6382      423723 :             if (nComponents == 3)
    6383             :             {
    6384      361353 :                 const GByte *CPL_RESTRICT pabySrc =
    6385             :                     static_cast<const GByte *>(pSourceBuffer);
    6386      361353 :                 GByte *CPL_RESTRICT pabyDest0 =
    6387             :                     static_cast<GByte *>(ppDestBuffer[0]);
    6388      361353 :                 GByte *CPL_RESTRICT pabyDest1 =
    6389             :                     static_cast<GByte *>(ppDestBuffer[1]);
    6390      361353 :                 GByte *CPL_RESTRICT pabyDest2 =
    6391             :                     static_cast<GByte *>(ppDestBuffer[2]);
    6392      361353 :                 GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6393             :                                       nIters);
    6394      361353 :                 return;
    6395             :             }
    6396       62370 :             else if (nComponents == 4)
    6397             :             {
    6398       62363 :                 const GByte *CPL_RESTRICT pabySrc =
    6399             :                     static_cast<const GByte *>(pSourceBuffer);
    6400       62363 :                 GByte *CPL_RESTRICT pabyDest0 =
    6401             :                     static_cast<GByte *>(ppDestBuffer[0]);
    6402       62363 :                 GByte *CPL_RESTRICT pabyDest1 =
    6403             :                     static_cast<GByte *>(ppDestBuffer[1]);
    6404       62363 :                 GByte *CPL_RESTRICT pabyDest2 =
    6405             :                     static_cast<GByte *>(ppDestBuffer[2]);
    6406       62363 :                 GByte *CPL_RESTRICT pabyDest3 =
    6407             :                     static_cast<GByte *>(ppDestBuffer[3]);
    6408       62363 :                 GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6409             :                                       pabyDest3, nIters);
    6410       62363 :                 return;
    6411           7 :             }
    6412             :         }
    6413             : #if ((defined(__GNUC__) && !defined(__clang__)) ||                             \
    6414             :      defined(__INTEL_CLANG_COMPILER)) &&                                       \
    6415             :     defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
    6416         642 :         else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
    6417         321 :                  CPLHaveRuntimeSSSE3())
    6418             :         {
    6419         321 :             if (nComponents == 3)
    6420             :             {
    6421         126 :                 const GUInt16 *CPL_RESTRICT panSrc =
    6422             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    6423         126 :                 GUInt16 *CPL_RESTRICT panDest0 =
    6424             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    6425         126 :                 GUInt16 *CPL_RESTRICT panDest1 =
    6426             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    6427         126 :                 GUInt16 *CPL_RESTRICT panDest2 =
    6428             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    6429         126 :                 GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
    6430             :                                               panDest2, nIters);
    6431         126 :                 return;
    6432             :             }
    6433             : #if !defined(__INTEL_CLANG_COMPILER)
    6434             :             // ICC autovectorizer doesn't do a good job, at least with icx
    6435             :             // 2022.1.0.20220316
    6436         195 :             else if (nComponents == 4)
    6437             :             {
    6438         195 :                 const GUInt16 *CPL_RESTRICT panSrc =
    6439             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    6440         195 :                 GUInt16 *CPL_RESTRICT panDest0 =
    6441             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    6442         195 :                 GUInt16 *CPL_RESTRICT panDest1 =
    6443             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    6444         195 :                 GUInt16 *CPL_RESTRICT panDest2 =
    6445             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    6446         195 :                 GUInt16 *CPL_RESTRICT panDest3 =
    6447             :                     static_cast<GUInt16 *>(ppDestBuffer[3]);
    6448         195 :                 GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
    6449             :                                               panDest2, panDest3, nIters);
    6450         195 :                 return;
    6451             :             }
    6452             : #endif
    6453             :         }
    6454             : #endif
    6455             :     }
    6456             : 
    6457          29 :     const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
    6458          29 :     const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
    6459         108 :     for (int iComp = 0; iComp < nComponents; iComp++)
    6460             :     {
    6461          79 :         GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
    6462          79 :                             iComp * nSourceDTSize,
    6463             :                         eSourceDT, nComponents * nSourceDTSize,
    6464          79 :                         ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
    6465             :     }
    6466             : }
    6467             : 
    6468             : /************************************************************************/
    6469             : /*                    GDALTranspose2DSingleToSingle()                   */
    6470             : /************************************************************************/
    6471             : /**
    6472             :  * Transpose a 2D array of non-complex values, in a efficient (cache-oblivious) way.
    6473             :  *
    6474             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6475             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6476             :  * @param nSrcWidth Width of pSrc array.
    6477             :  * @param nSrcHeight Height of pSrc array.
    6478             :  */
    6479             : 
    6480             : template <class DST, class SRC>
    6481         160 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
    6482             :                                    DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6483             :                                    size_t nSrcHeight)
    6484             : {
    6485         160 :     constexpr size_t blocksize = 32;
    6486         345 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6487             :     {
    6488         185 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    6489        5016 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6490             :         {
    6491             :             // transpose the block beginning at [i,j]
    6492        4831 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    6493       26185 :             for (size_t k = i; k < max_k; ++k)
    6494             :             {
    6495      669282 :                 for (size_t l = j; l < max_l; ++l)
    6496             :                 {
    6497      647928 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],
    6498      647928 :                                  pDst[k + l * nSrcHeight]);
    6499             :                 }
    6500             :             }
    6501             :         }
    6502             :     }
    6503         160 : }
    6504             : 
    6505             : /************************************************************************/
    6506             : /*                   GDALTranspose2DComplexToComplex()                  */
    6507             : /************************************************************************/
    6508             : /**
    6509             :  * Transpose a 2D array of complex values into an array of complex values,
    6510             :  * in an efficient (cache-friendly, tiled by blocks of 32 x 32 values) way.
    6511             :  * Value n of either array is made of the 2 consecutive components at indices 2 * n and 2 * n + 1; SRC and DST are the component types, and each component is converted with GDALCopyWord().
    6512             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6513             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6514             :  * @param nSrcWidth Width of pSrc array, in complex values.
    6515             :  * @param nSrcHeight Height of pSrc array, in complex values.
    6516             :  */
    6517             : template <class DST, class SRC>
    6518          25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
    6519             :                                      DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6520             :                                      size_t nSrcHeight)
    6521             : {
    6522          25 :     constexpr size_t blocksize = 32;  // edge length of a tile, in complex values
    6523          50 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)  // i: first source row of the tile
    6524             :     {
    6525          25 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);  // end row (exclusive), clamped for a partial tile
    6526          50 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)  // j: first source column of the tile
    6527             :         {
    6528             :             // transpose the block beginning at [i,j]
    6529          25 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);  // end column (exclusive), clamped for a partial tile
    6530          75 :             for (size_t k = i; k < max_k; ++k)
    6531             :             {
    6532         200 :                 for (size_t l = j; l < max_l; ++l)
    6533             :                 {
    6534         150 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],  // first component (presumably the real part)
    6535         150 :                                  pDst[2 * (k + l * nSrcHeight) + 0]);
    6536         150 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],  // second component (presumably the imaginary part)
    6537         150 :                                  pDst[2 * (k + l * nSrcHeight) + 1]);
    6538             :                 }
    6539             :             }
    6540             :         }
    6541             :     }
    6542          25 : }
    6543             : 
    6544             : /************************************************************************/
    6545             : /*                   GDALTranspose2DComplexToSingle()                  */
    6546             : /************************************************************************/
    6547             : /**
    6548             :  * Transpose a 2D array of complex values into an array of non-complex values,
    6549             :  * in an efficient (cache-friendly, tiled by blocks of 32 x 32 values) way.
    6550             :  * Only the first component (index 2 * n, presumably the real part) of source value n is copied, converted with GDALCopyWord(); the second component is never read.
    6551             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6552             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6553             :  * @param nSrcWidth Width of pSrc array, in complex values.
    6554             :  * @param nSrcHeight Height of pSrc array, in complex values.
    6555             :  */
    6556             : template <class DST, class SRC>
    6557          55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
    6558             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6559             :                                     size_t nSrcHeight)
    6560             : {
    6561          55 :     constexpr size_t blocksize = 32;  // edge length of a tile, in values
    6562         110 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)  // i: first source row of the tile
    6563             :     {
    6564          55 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);  // end row (exclusive), clamped for a partial tile
    6565         110 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)  // j: first source column of the tile
    6566             :         {
    6567             :             // transpose the block beginning at [i,j]
    6568          55 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);  // end column (exclusive), clamped for a partial tile
    6569         165 :             for (size_t k = i; k < max_k; ++k)
    6570             :             {
    6571         440 :                 for (size_t l = j; l < max_l; ++l)
    6572             :                 {
    6573         330 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],  // first component of the source value at (row k, column l)
    6574         330 :                                  pDst[k + l * nSrcHeight]);  // single destination value at (row l, column k)
    6575             :                 }
    6576             :             }
    6577             :         }
    6578             :     }
    6579          55 : }
    6580             : 
    6581             : /************************************************************************/
    6582             : /*                   GDALTranspose2DSingleToComplex()                  */
    6583             : /************************************************************************/
    6584             : /**
    6585             :  * Transpose a 2D array of non-complex values into an array of complex values,
    6586             :  * in an efficient (cache-friendly, tiled by blocks of 32 x 32 values) way.
    6587             :  * Each source value is converted with GDALCopyWord() into the first component (index 2 * n) of destination value n; the second component (index 2 * n + 1) is set to 0.
    6588             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6589             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6590             :  * @param nSrcWidth Width of pSrc array.
    6591             :  * @param nSrcHeight Height of pSrc array.
    6592             :  */
    6593             : template <class DST, class SRC>
    6594          55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
    6595             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6596             :                                     size_t nSrcHeight)
    6597             : {
    6598          55 :     constexpr size_t blocksize = 32;  // edge length of a tile, in values
    6599         110 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)  // i: first source row of the tile
    6600             :     {
    6601          55 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);  // end row (exclusive), clamped for a partial tile
    6602         110 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)  // j: first source column of the tile
    6603             :         {
    6604             :             // transpose the block beginning at [i,j]
    6605          55 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);  // end column (exclusive), clamped for a partial tile
    6606         165 :             for (size_t k = i; k < max_k; ++k)
    6607             :             {
    6608         440 :                 for (size_t l = j; l < max_l; ++l)
    6609             :                 {
    6610         330 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],  // source value at (row k, column l)
    6611         330 :                                  pDst[2 * (k + l * nSrcHeight) + 0]);  // first component of the destination value at (row l, column k)
    6612         330 :                     pDst[2 * (k + l * nSrcHeight) + 1] = 0;  // second component (presumably the imaginary part) is zeroed
    6613             :                 }
    6614             :             }
    6615             :         }
    6616             :     }
    6617          55 : }
    6618             : 
    6619             : /************************************************************************/
    6620             : /*                        GDALTranspose2D()                             */
    6621             : /************************************************************************/
    6622             : 
    6623             : template <class DST, bool DST_IS_COMPLEX>  // DST: component type of pDst; DST_IS_COMPLEX: true when pDst holds 2 components per value
    6624         295 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,  // dispatches on the runtime source type eSrcType
    6625             :                             size_t nSrcWidth, size_t nSrcHeight)
    6626             : {
    6627             : #define CALL_GDALTranspose2D_internal(SRC_TYPE)                                \
    6628             :     do                                                                         \
    6629             :     {                                                                          \
    6630             :         if constexpr (DST_IS_COMPLEX)                                          \
    6631             :         {                                                                      \
    6632             :             GDALTranspose2DSingleToComplex(                                    \
    6633             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6634             :                 nSrcHeight);                                                   \
    6635             :         }                                                                      \
    6636             :         else                                                                   \
    6637             :         {                                                                      \
    6638             :             GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
    6639             :                                           pDst, nSrcWidth, nSrcHeight);        \
    6640             :         }                                                                      \
    6641             :     } while (0)
    6642             : 
    6643             : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE)                         \
    6644             :     do                                                                         \
    6645             :     {                                                                          \
    6646             :         if constexpr (DST_IS_COMPLEX)                                          \
    6647             :         {                                                                      \
    6648             :             GDALTranspose2DComplexToComplex(                                   \
    6649             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6650             :                 nSrcHeight);                                                   \
    6651             :         }                                                                      \
    6652             :         else                                                                   \
    6653             :         {                                                                      \
    6654             :             GDALTranspose2DComplexToSingle(                                    \
    6655             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6656             :                 nSrcHeight);                                                   \
    6657             :         }                                                                      \
    6658             :     } while (0)
    6659             : 
    6660             :     // clang-format off
    6661         295 :     switch (eSrcType)  // non-complex sources use the SingleTo* helpers, complex sources the ComplexTo* helpers
    6662             :     {
    6663          16 :         case GDT_UInt8:     CALL_GDALTranspose2D_internal(uint8_t); break;
    6664          15 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t); break;
    6665          33 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t); break;
    6666          20 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t); break;
    6667          24 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t); break;
    6668          16 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t); break;
    6669          16 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t); break;
    6670          16 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t); break;
    6671          16 :         case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16); break;
    6672          19 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float); break;
    6673          24 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double); break;
    6674          16 :         case GDT_CInt16:   CALL_GDALTranspose2DComplex_internal(int16_t); break;  // complex sources: SRC_TYPE is the component type
    6675          16 :         case GDT_CInt32:   CALL_GDALTranspose2DComplex_internal(int32_t); break;
    6676          16 :         case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
    6677          16 :         case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
    6678          16 :         case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
    6679           0 :         case GDT_Unknown:  // no helper is called for these two values: pDst is left untouched
    6680             :         case GDT_TypeCount:
    6681           0 :             break;
    6682             :     }
    6683             :         // clang-format on
    6684             : 
    6685             : #undef CALL_GDALTranspose2D_internal
    6686             : #undef CALL_GDALTranspose2DComplex_internal
    6687         295 : }
    6688             : 
    6689             : /************************************************************************/
    6690             : /*                      GDALInterleave2Byte()                           */
    6691             : /************************************************************************/
    6692             : 
    6693             : #if defined(HAVE_SSE2) &&                                                      \
    6694             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    6695             : 
    6696             : // ICC autovectorizer doesn't do a good job at generating good SSE code,
    6697             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
    6698             : #if defined(__GNUC__)
    6699             : __attribute__((noinline))
    6700             : #endif
    6701             : static void
    6702             : GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,  // pSrc: 2 consecutive rows of nIters bytes each
    6703             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters)  // pDst: 2 * nIters bytes, written as row0[0], row1[0], row0[1], row1[1], ...
    6704             : {
    6705             :     size_t i = 0;  // declared outside the loops: the scalar loop below resumes where the SSE2 loop stops
    6706             :     constexpr size_t VALS_PER_ITER = 16;  // number of bytes in one 128-bit register
    6707             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)  // only whole groups of 16 values per row
    6708             :     {
    6709             :         __m128i xmm0 =
    6710             :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));  // 16 bytes of the first row (unaligned load)
    6711             :         __m128i xmm1 = _mm_loadu_si128(
    6712             :             reinterpret_cast<__m128i const *>(pSrc + i + nIters));  // the 16 bytes at the same position in the second row
    6713             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
    6714             :                          _mm_unpacklo_epi8(xmm0, xmm1));  // interleaves bytes 0..7 of both registers
    6715             :         _mm_storeu_si128(
    6716             :             reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
    6717             :             _mm_unpackhi_epi8(xmm0, xmm1));  // interleaves bytes 8..15 of both registers
    6718             :     }
    6719             : #if defined(__clang__)
    6720             : #pragma clang loop vectorize(disable)
    6721             : #endif
    6722             :     for (; i < nIters; ++i)  // scalar tail: the remaining (fewer than 16) values per row
    6723             :     {
    6724             :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];
    6725             :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];
    6726             :     }
    6727             : }
    6728             : 
    6729             : #else
    6730             : 
    6731             : #if defined(__GNUC__) && !defined(__clang__)
    6732             : __attribute__((optimize("tree-vectorize")))
    6733             : #endif
    6734             : #if defined(__GNUC__)
    6735             : __attribute__((noinline))
    6736             : #endif
    6737             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6738             : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
    6739             : #pragma clang diagnostic push
    6740             : #pragma clang diagnostic ignored "-Wpass-failed"
    6741             : #endif
    6742             : static void
    6743           9 : GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,  // pSrc: 2 consecutive rows of nIters bytes each
    6744             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters)  // pDst: 2 * nIters bytes, written as row0[0], row1[0], row0[1], row1[1], ...
    6745             : {
    6746             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6747             : #pragma clang loop vectorize(enable)
    6748             : #endif
    6749      355429 :     for (size_t i = 0; i < nIters; ++i)  // plain loop: vectorization is left to the compiler (see the attributes/pragmas above)
    6750             :     {
    6751      355420 :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];  // byte i of the first row
    6752      355420 :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];  // byte i of the second row
    6753             :     }
    6754           9 : }
    6755             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6756             : #pragma clang diagnostic pop
    6757             : #endif
    6758             : 
    6759             : #endif
    6760             : 
    6761             : /************************************************************************/
    6762             : /*                      GDALInterleave4Byte()                           */
    6763             : /************************************************************************/
    6764             : 
    6765             : #if defined(HAVE_SSE2) &&                                                      \
    6766             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    6767             : 
    6768             : // ICC autovectorizer doesn't do a good job at generating good SSE code,
    6769             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
    6770             : #if defined(__GNUC__)
    6771             : __attribute__((noinline))
    6772             : #endif
    6773             : static void
    6774             : GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,  // pSrc: 4 consecutive rows of nIters bytes each
    6775             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters)  // pDst: 4 * nIters bytes, written as row0[0], row1[0], row2[0], row3[0], row0[1], ...
    6776             : {
    6777             :     size_t i = 0;  // declared outside the loops: the scalar loop below resumes where the SSE2 loop stops
    6778             :     constexpr size_t VALS_PER_ITER = 16;  // number of bytes in one 128-bit register
    6779             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)  // only whole groups of 16 values per row
    6780             :     {
    6781             :         __m128i xmm0 = _mm_loadu_si128(
    6782             :             reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));  // 16 bytes of row 0
    6783             :         __m128i xmm1 = _mm_loadu_si128(
    6784             :             reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));  // 16 bytes of row 1
    6785             :         __m128i xmm2 = _mm_loadu_si128(
    6786             :             reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));  // 16 bytes of row 2
    6787             :         __m128i xmm3 = _mm_loadu_si128(
    6788             :             reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));  // 16 bytes of row 3
    6789             :         auto tmp0 = _mm_unpacklo_epi8(
    6790             :             xmm0,
    6791             :             xmm1);  // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
    6792             :         auto tmp1 = _mm_unpackhi_epi8(
    6793             :             xmm0,
    6794             :             xmm1);  // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
    6795             :         auto tmp2 = _mm_unpacklo_epi8(
    6796             :             xmm2,
    6797             :             xmm3);  // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
    6798             :         auto tmp3 = _mm_unpackhi_epi8(
    6799             :             xmm2,
    6800             :             xmm3);  // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
    6801             :         auto tmp2_0 = _mm_unpacklo_epi16(
    6802             :             tmp0,
    6803             :             tmp2);  // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
    6804             :         auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);  // same layout, for values 4..7 of the 4 rows
    6805             :         auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);  // same layout, for values 8..11 of the 4 rows
    6806             :         auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);  // same layout, for values 12..15 of the 4 rows
    6807             :         _mm_storeu_si128(
    6808             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
    6809             :             tmp2_0);
    6810             :         _mm_storeu_si128(
    6811             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
    6812             :             tmp2_1);
    6813             :         _mm_storeu_si128(
    6814             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
    6815             :             tmp2_2);
    6816             :         _mm_storeu_si128(
    6817             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
    6818             :             tmp2_3);
    6819             :     }
    6820             : #if defined(__clang__)
    6821             : #pragma clang loop vectorize(disable)
    6822             : #endif
    6823             :     for (; i < nIters; ++i)  // scalar tail: the remaining (fewer than 16) values per row
    6824             :     {
    6825             :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];
    6826             :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];
    6827             :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];
    6828             :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];
    6829             :     }
    6830             : }
    6831             : 
    6832             : #else
    6833             : 
    6834             : #if defined(__GNUC__) && !defined(__clang__)
    6835             : __attribute__((optimize("tree-vectorize")))
    6836             : #endif
    6837             : #if defined(__GNUC__)
    6838             : __attribute__((noinline))
    6839             : #endif
    6840             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6841             : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
    6842             : #pragma clang diagnostic push
    6843             : #pragma clang diagnostic ignored "-Wpass-failed"
    6844             : #endif
    6845             : static void
    6846           9 : GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,  // pSrc: 4 consecutive rows of nIters bytes each
    6847             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters)  // pDst: 4 * nIters bytes, written as row0[0], row1[0], row2[0], row3[0], row0[1], ...
    6848             : {
    6849             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6850             : #pragma clang loop vectorize(enable)
    6851             : #endif
    6852       75443 :     for (size_t i = 0; i < nIters; ++i)  // plain loop: vectorization is left to the compiler (see the attributes/pragmas above)
    6853             :     {
    6854       75434 :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];  // byte i of row 0
    6855       75434 :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];  // byte i of row 1
    6856       75434 :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];  // byte i of row 2
    6857       75434 :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];  // byte i of row 3
    6858             :     }
    6859           9 : }
    6860             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6861             : #pragma clang diagnostic pop
    6862             : #endif
    6863             : 
    6864             : #endif
    6865             : 
    6866             : /************************************************************************/
    6867             : /*                        GDALTranspose2D()                             */
    6868             : /************************************************************************/
    6869             : 
    6870             : /**
    6871             :  * Transpose a 2D array in an efficient (cache-friendly) way, converting values from eSrcType to eDstType.
    6872             :  * Complex to non-complex keeps only the first component of each value; non-complex to complex sets the second component to 0. Nothing is written to pDst if eSrcType or eDstType is GDT_Unknown or GDT_TypeCount.
    6873             :  * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
    6874             :  * @param eSrcType Data type of pSrc.
    6875             :  * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
    6876             :  * @param eDstType Data type of pDst.
    6877             :  * @param nSrcWidth Width of pSrc array.
    6878             :  * @param nSrcHeight Height of pSrc array.
    6879             :  * @since GDAL 3.11
    6880             :  */
    6881             : 
    6882         346 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
    6883             :                      GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
    6884             : {
    6885         346 :     if (eSrcType == eDstType && (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8))  // same 1-byte type on both sides: bytes are moved as-is, so Int8 can be handled as uint8_t
    6886             :     {
    6887          51 :         if (nSrcHeight == 2)  // 2 rows: transposing amounts to interleaving the 2 rows
    6888             :         {
    6889           9 :             GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
    6890             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    6891           9 :             return;
    6892             :         }
    6893          42 :         if (nSrcHeight == 4)  // 4 rows: transposing amounts to interleaving the 4 rows
    6894             :         {
    6895           9 :             GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
    6896             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    6897           9 :             return;
    6898             :         }
    6899             : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                    \
    6900             :      (defined(__x86_64) || defined(_M_X64)))
    6901          33 :         if (CPLHaveRuntimeSSSE3())  // otherwise falls through to the generic typed implementation below
    6902             :         {
    6903          33 :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    6904             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    6905             :                                        nSrcHeight);
    6906          33 :             return;
    6907             :         }
    6908             : #elif defined(USE_NEON_OPTIMIZATIONS)
    6909             :         {  // NOTE(review): the _SSSE3-named routine is presumably built for NEON through an intrinsics translation layer - confirm
    6910             :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    6911             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    6912             :                                        nSrcHeight);
    6913             :             return;
    6914             :         }
    6915             : #endif
    6916             :     }
    6917             : 
    6918             : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX)                \
    6919             :     GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>(                                 \
    6920             :         pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
    6921             : 
    6922             :     // clang-format off
    6923         295 :     switch (eDstType)  // generic path: dispatch on the destination type here; the template then dispatches on eSrcType
    6924             :     {
    6925          15 :         case GDT_UInt8:     CALL_GDALTranspose2D_internal(uint8_t, false); break;
    6926          15 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t, false); break;
    6927          33 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t, false); break;
    6928          20 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t, false); break;
    6929          24 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t, false); break;
    6930          16 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t, false); break;
    6931          16 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t, false); break;
    6932          16 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t, false); break;
    6933          16 :         case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16, false); break;
    6934          19 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float, false); break;
    6935          25 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double, false); break;
    6936          16 :         case GDT_CInt16:   CALL_GDALTranspose2D_internal(int16_t, true); break;  // complex destinations: DST_TYPE is the component type
    6937          16 :         case GDT_CInt32:   CALL_GDALTranspose2D_internal(int32_t, true); break;
    6938          16 :         case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
    6939          16 :         case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
    6940          16 :         case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
    6941           0 :         case GDT_Unknown:  // no helper is called for these two values: pDst is left untouched
    6942             :         case GDT_TypeCount:
    6943           0 :             break;
    6944             :     }
    6945             :         // clang-format on
    6946             : 
    6947             : #undef CALL_GDALTranspose2D_internal
    6948             : }
    6949             : 
    6950             : /************************************************************************/
    6951             : /*                     ExtractBitAndConvertTo255()                      */
    6952             : /************************************************************************/
    6953             : 
    6954             : #if defined(__GNUC__) || defined(_MSC_VER)
    6955             : // Returns 255 if bit nBit (0 = least-significant bit) of byVal is set, 0 otherwise. Signedness of char is implementation dependent, so be explicit.
    6956             : // Assumes two's complement integer types and sign extension when right shifting.
    6957             : // GCC guarantees such:
    6958             : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
    6959      124890 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
    6960             : {
    6961      124890 :     return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>  // move bit nBit to bit 7, the sign bit once narrowed to signed char
    6962      124890 :                               7);  // the sign-extending shift copies that bit into all 8 bits: 0x00 or 0xFF
    6963             : }
    6964             : #else
    6965             : // Portable way: returns 255 if bit nBit (0 = least-significant bit) of byVal is set, 0 otherwise.
    6966             : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
    6967             : {
    6968             :     return (byVal & (1 << nBit)) ? 255 : 0;
    6969             : }
    6970             : #endif
    6971             : 
    6972             : /************************************************************************/
    6973             : /*                   ExpandEightPackedBitsToByteAt255()                 */
    6974             : /************************************************************************/
    6975             : 
    6976       15569 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
    6977             :                                                     GByte abyOutput[8])
    6978             : {
    6979       15569 :     abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);
    6980       15569 :     abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
    6981       15569 :     abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
    6982       15569 :     abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
    6983       15569 :     abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
    6984       15569 :     abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
    6985       15569 :     abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
    6986       15569 :     abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);
    6987       15569 : }
    6988             : 
    6989             : /************************************************************************/
    6990             : /*                GDALExpandPackedBitsToByteAt0Or255()                  */
    6991             : /************************************************************************/
    6992             : 
    6993             : /** Expand packed-bits (ordered from most-significant bit to least one)
    6994             :   into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
    6995             :   at 1 to a byte at 255.
    6996             : 
    6997             :  The function does (in a possibly more optimized way) the following:
    6998             :  \code{.cpp}
    6999             :  for (size_t i = 0; i < nInputBits; ++i )
    7000             :  {
    7001             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
    7002             :  }
    7003             :  \endcode
    7004             : 
    7005             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    7006             :  @param pabyOutput Output array of nInputBits bytes.
    7007             :  @param nInputBits Number of valid bits in pabyInput.
    7008             : 
    7009             :  @since 3.11
    7010             : */
    7011             : 
    7012       45145 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
    7013             :                                         GByte *CPL_RESTRICT pabyOutput,
    7014             :                                         size_t nInputBits)
    7015             : {
    7016       45145 :     const size_t nInputWholeBytes = nInputBits / 8;
    7017       45145 :     size_t iByte = 0;
    7018             : 
    7019             : #ifdef HAVE_SSE2
    7020             :     // Mask to isolate each bit
    7021       45145 :     const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
    7022             :                                           8, 16, 32, 64, -128);
    7023       45145 :     const __m128i zero = _mm_setzero_si128();
    7024       45145 :     const __m128i all_ones = _mm_set1_epi8(-1);
    7025             : #ifdef __SSSE3__
    7026             :     const __m128i dispatch_two_bytes =
    7027             :         _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
    7028             : #endif
    7029       45145 :     constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
    7030      135654 :     for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
    7031             :     {
    7032       90509 :         __m128i reg_ori = _mm_loadu_si128(
    7033       90509 :             reinterpret_cast<const __m128i *>(pabyInput + iByte));
    7034             : 
    7035       90509 :         constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
    7036      814581 :         for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
    7037             :         {
    7038             :             // Given reg_ori = (A, B, ... 14 other bytes ...),
    7039             :             // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
    7040             : #ifdef __SSSE3__
    7041             :             __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
    7042             : #else
    7043      724072 :             __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
    7044      724072 :             reg = _mm_unpacklo_epi16(reg, reg);
    7045      724072 :             reg = _mm_unpacklo_epi32(reg, reg);
    7046             : #endif
    7047             : 
    7048             :             // Test if bits of interest are set
    7049      724072 :             reg = _mm_and_si128(reg, bit_mask);
    7050             : 
    7051             :             // Now test if those bits are set, by comparing to zero. So the
    7052             :             // result will be that bytes where bits are set will be at 0, and
    7053             :             // ones where they are cleared will be at 0xFF. So the inverse of
    7054             :             // the end result we want!
    7055      724072 :             reg = _mm_cmpeq_epi8(reg, zero);
    7056             : 
    7057             :             // Invert the result
    7058      724072 :             reg = _mm_andnot_si128(reg, all_ones);
    7059             : 
    7060             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
    7061             : 
    7062      724072 :             pabyOutput += SSE_REG_SIZE;
    7063             : 
    7064             :             // Right-shift of 2 bytes
    7065      724072 :             reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
    7066             :         }
    7067             :     }
    7068             : 
    7069             : #endif  // HAVE_SSE2
    7070             : 
    7071       60714 :     for (; iByte < nInputWholeBytes; ++iByte)
    7072             :     {
    7073       15569 :         ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
    7074       15569 :         pabyOutput += 8;
    7075             :     }
    7076       45483 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
    7077             :     {
    7078         338 :         *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
    7079         338 :         ++pabyOutput;
    7080             :     }
    7081       45145 : }
    7082             : 
    7083             : /************************************************************************/
    7084             : /*                   ExpandEightPackedBitsToByteAt1()                   */
    7085             : /************************************************************************/
    7086             : 
    7087      136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
    7088             :                                                   GByte abyOutput[8])
    7089             : {
    7090      136113 :     abyOutput[0] = (byVal >> 7) & 0x1;
    7091      136113 :     abyOutput[1] = (byVal >> 6) & 0x1;
    7092      136113 :     abyOutput[2] = (byVal >> 5) & 0x1;
    7093      136113 :     abyOutput[3] = (byVal >> 4) & 0x1;
    7094      136113 :     abyOutput[4] = (byVal >> 3) & 0x1;
    7095      136113 :     abyOutput[5] = (byVal >> 2) & 0x1;
    7096      136113 :     abyOutput[6] = (byVal >> 1) & 0x1;
    7097      136113 :     abyOutput[7] = (byVal >> 0) & 0x1;
    7098      136113 : }
    7099             : 
    7100             : /************************************************************************/
    7101             : /*                GDALExpandPackedBitsToByteAt0Or1()                    */
    7102             : /************************************************************************/
    7103             : 
    7104             : /** Expand packed-bits (ordered from most-significant bit to least one)
    7105             :   into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
    7106             :   at 1 to a byte at 1.
    7107             : 
    7108             :  The function does (in a possibly more optimized way) the following:
    7109             :  \code{.cpp}
    7110             :  for (size_t i = 0; i < nInputBits; ++i )
    7111             :  {
    7112             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
    7113             :  }
    7114             :  \endcode
    7115             : 
    7116             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    7117             :  @param pabyOutput Output array of nInputBits bytes.
    7118             :  @param nInputBits Number of valid bits in pabyInput.
    7119             : 
    7120             :  @since 3.11
    7121             : */
    7122             : 
    7123        7041 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
    7124             :                                       GByte *CPL_RESTRICT pabyOutput,
    7125             :                                       size_t nInputBits)
    7126             : {
    7127        7041 :     const size_t nInputWholeBytes = nInputBits / 8;
    7128        7041 :     size_t iByte = 0;
    7129      143154 :     for (; iByte < nInputWholeBytes; ++iByte)
    7130             :     {
    7131      136113 :         ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
    7132      136113 :         pabyOutput += 8;
    7133             :     }
    7134       18902 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
    7135             :     {
    7136       11861 :         *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
    7137       11861 :         ++pabyOutput;
    7138             :     }
    7139        7041 : }

Generated by: LCOV version 1.14