LCOV - code coverage report
Current view: top level - gcore - rasterio.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2730 2987 91.4 %
Date: 2026-05-07 23:23:29 Functions: 690 730 94.5 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  Contains default implementation of GDALRasterBand::IRasterIO()
       5             :  *           and supporting functions of broader utility.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 1998, Frank Warmerdam
      10             :  * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal.h"
      17             : #include "gdal_priv.h"
      18             : 
      19             : #include <cassert>
      20             : #include <climits>
      21             : #include <cmath>
      22             : #include <cstddef>
      23             : #include <cstdio>
      24             : #include <cstdlib>
      25             : #include <cstring>
      26             : 
      27             : #include <algorithm>
      28             : #include <limits>
      29             : #include <stdexcept>
      30             : #include <type_traits>
      31             : 
      32             : #include "cpl_conv.h"
      33             : #include "cpl_cpu_features.h"
      34             : #include "cpl_error.h"
      35             : #include "cpl_float.h"
      36             : #include "cpl_progress.h"
      37             : #include "cpl_string.h"
      38             : #include "cpl_vsi.h"
      39             : #include "gdal_priv_templates.hpp"
      40             : #include "gdal_vrt.h"
      41             : #include "gdalwarper.h"
      42             : #include "memdataset.h"
      43             : #include "vrtdataset.h"
      44             : 
      45             : #if defined(__x86_64) || defined(_M_X64)
      46             : #include <emmintrin.h>
      47             : #include <immintrin.h>
      48             : #define HAVE_SSE2
      49             : // AVX2 dispatch: compile AVX2 code with target attribute, detect at runtime
      50             : #if (defined(__GNUC__) || defined(__clang__)) &&                               \
      51             :     defined(HAVE_AVX2_AT_COMPILE_TIME)
      52             : #define HAVE_AVX2_DISPATCH
      53             : #elif defined(_MSC_VER)
      54             : #include <intrin.h>
      55             : #define HAVE_AVX2_DISPATCH
      56             : #endif
      57             : #elif defined(USE_NEON_OPTIMIZATIONS)
      58             : #include "include_sse2neon.h"
      59             : #define HAVE_SSE2
      60             : #endif
      61             : 
      62             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
      63             : #include "rasterio_ssse3.h"
      64             : #ifdef __SSSE3__
      65             : #include <tmmintrin.h>
      66             : #endif
      67             : #endif
      68             : 
      69             : #ifdef __SSE4_1__
      70             : #include <smmintrin.h>
      71             : #endif
      72             : 
      73             : #ifdef __GNUC__
      74             : #define CPL_NOINLINE __attribute__((noinline))
      75             : #else
      76             : #define CPL_NOINLINE
      77             : #endif
      78             : // Forward declaration (the definition is not visible in this part of the report); called below for the 1-byte same-type strided copy. NOTE(review): presumably copies nWordCount bytes using the given source/destination pixel strides - confirm against the definition.
      79             : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
      80             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
      81             :                              int nDstPixelStride, GPtrDiff_t nWordCount);
      82             : // Horizontal subsampling helper: writes the nBufXSize destination pixels (at least one pixel is always written), taking one source pixel every nSrcXInc columns starting at column iSrcX; the final pixel's source column is clamped to poBand->GetXSize() - 1.
      83             : /************************************************************************/
      84             : /*                     DownsamplingIntegerXFactor()                     */
      85             : /************************************************************************/
      86             : // bSameDataType selects a raw copy of DATA_TYPE_SIZE-byte pixels; otherwise GDALCopyWords64() converts eDataType -> eBufType. iSrcOffsetCst is a constant pixel offset added to the in-block column (presumably the row offset inside the block - confirm against the caller). nStartBlockX and poBlock are in/out: a block already held by the caller is reused while iSrcX stays inside it, and the last block obtained here is not released before returning. Returns false (with poBlock == nullptr) when GetLockedBlockRef() fails, true otherwise.
      87             : template <bool bSameDataType, int DATA_TYPE_SIZE>
      88      695850 : static bool DownsamplingIntegerXFactor(
      89             :     GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
      90             :     GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
      91             :     GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
      92             :     int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
      93             : {
      94      695850 :     const int nBandDataSize =
      95             :         bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
      96      695850 :     int nOuterLoopIters = nBufXSize - 1;
      97      695850 :     const int nIncSrcOffset = nSrcXInc * nBandDataSize;
      98             :     const GByte *CPL_RESTRICT pabySrcData;
      99      695850 :     int nEndBlockX = nBlockXSize + nStartBlockX;
     100             :     // First pixel: jump straight into the loop body, bypassing the loop condition. The caller's block is reused when iSrcX is still inside [nStartBlockX, nEndBlockX).
     101      695850 :     if (iSrcX < nEndBlockX)
     102             :     {
     103      295062 :         CPLAssert(poBlock);
     104      295062 :         goto no_reload_block;
     105             :     }
     106      400788 :     goto reload_block;
     107             : 
     108             :     // Don't do the last iteration in the loop, as iSrcX might go beyond
     109             :     // nRasterXSize - 1
     110     1265113 :     while (--nOuterLoopIters >= 1)
     111             :     {
     112      201834 :         iSrcX += nSrcXInc;
     113      201834 :         pabySrcData += nIncSrcOffset;
     114      201834 :         pabyDstData += nPixelSpace;
     115             : 
     116             :         /* --------------------------------------------------------------------
     117             :          */
     118             :         /*      Ensure we have the appropriate block loaded. */
     119             :         /* --------------------------------------------------------------------
     120             :          */
     121      201834 :         if (iSrcX >= nEndBlockX)
     122             :         {
     123      201834 :         reload_block:
     124             :         {
     125      615212 :             const int nLBlockX = iSrcX / nBlockXSize;
     126      615212 :             nStartBlockX = nLBlockX * nBlockXSize;
     127      615212 :             nEndBlockX = nStartBlockX + nBlockXSize;
     128             :             // Release the block held so far before locking the one that contains iSrcX.
     129      615212 :             if (poBlock != nullptr)
     130      341376 :                 poBlock->DropLock();
     131             : 
     132      615212 :             poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     133      615212 :             if (poBlock == nullptr)
     134             :             {
     135           1 :                 return false;
     136             :             }
     137             :         }
     138             :         // Reached both after a reload and directly (via goto) when the current block can be reused: recompute the source pointer from the block start.
     139      615211 :         no_reload_block:
     140             :             const GByte *pabySrcBlock =
     141     1265113 :                 static_cast<const GByte *>(poBlock->GetDataRef());
     142     1265113 :             GPtrDiff_t iSrcOffset =
     143     1265113 :                 (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
     144     1265113 :             pabySrcData = pabySrcBlock + iSrcOffset;
     145             :         }
     146             : 
     147             :         /* --------------------------------------------------------------------
     148             :          */
     149             :         /*      Copy the maximum run of pixels. */
     150             :         /* --------------------------------------------------------------------
     151             :          */
     152             :         // nIters = number of samples available in the current block, i.e. ceil((nEndBlockX - iSrcX) / nSrcXInc), capped by nOuterLoopIters; it can be 0 (e.g. on the final clamped pixel), in which case exactly one pixel is still copied.
     153     1265113 :         const int nIters = std::min(
     154     1265113 :             (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
     155             :         if (bSameDataType)
     156             :         {
     157     1264670 :             memcpy(pabyDstData, pabySrcData, nBandDataSize);
     158     1264670 :             if (nIters > 1)
     159             :             {
     160             :                 if (DATA_TYPE_SIZE == 1)
     161             :                 {
     162      326320 :                     pabySrcData += nIncSrcOffset;
     163      326320 :                     pabyDstData += nPixelSpace;
     164      326320 :                     GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
     165      326320 :                                      nPixelSpace, nIters - 1);
     166      326320 :                     pabySrcData +=
     167      326320 :                         static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
     168      326320 :                     pabyDstData +=
     169      326320 :                         static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);  // 1 + (nIters - 2) strides in total: both pointers end nIters - 1 strides past the first pixel, exactly as the generic loop below leaves them
     170             :                 }
     171             :                 else
     172             :                 {
     173     4395716 :                     for (int i = 0; i < nIters - 1; i++)
     174             :                     {
     175     4197550 :                         pabySrcData += nIncSrcOffset;
     176     4197550 :                         pabyDstData += nPixelSpace;
     177     4197550 :                         memcpy(pabyDstData, pabySrcData, nBandDataSize);
     178             :                     }
     179             :                 }
     180      524490 :                 iSrcX += nSrcXInc * (nIters - 1);
     181      524490 :                 nOuterLoopIters -= nIters - 1;
     182             :             }
     183             :         }
     184             :         else
     185             :         {
     186             :             // Type to type conversion; std::max(1, nIters) because nIters may be 0 while one pixel must still be converted.
     187         443 :             GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
     188         443 :                             eBufType, nPixelSpace, std::max(1, nIters));
     189         443 :             if (nIters > 1)
     190             :             {
     191         216 :                 pabySrcData +=
     192         216 :                     static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
     193         216 :                 pabyDstData +=
     194         216 :                     static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
     195         216 :                 iSrcX += nSrcXInc * (nIters - 1);
     196         216 :                 nOuterLoopIters -= nIters - 1;
     197             :             }
     198             :         }
     199             :     }
     200             :     // nOuterLoopIters is 0 here only while the final pixel is still owed; after that extra pass the loop condition decrements it to -1, so the branch below runs at most once.
     201             :     // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
     202     1063279 :     if (nOuterLoopIters == 0)
     203             :     {
     204      367430 :         const int nRasterXSize = poBand->GetXSize();
     205      367430 :         iSrcX =
     206      734860 :             static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
     207      367430 :                                       static_cast<GInt64>(nRasterXSize - 1)));
     208      367430 :         pabyDstData += nPixelSpace;
     209      367430 :         if (iSrcX < nEndBlockX)
     210             :         {
     211      354840 :             goto no_reload_block;
     212             :         }
     213       12590 :         goto reload_block;
     214             :     }
     215      695849 :     return true;
     216             : }
     217             : // Returns a * b, with the result type deduced from the operand types. NOTE(review): CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW presumably exempts this multiplication from the unsigned-integer-overflow sanitizer - confirm in cpl_port.h.
     218             : template <class A, class B>
     219     2818770 : CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(A a, B b)
     220             : {
     221     2818770 :     return a * b;
     222             : }
     223             : 
     224             : /************************************************************************/
     225             : /*                             IRasterIO()                              */
     226             : /*                                                                      */
     227             : /*      Default internal implementation of RasterIO() ... utilizes      */
     228             : /*      the Block access methods to satisfy the request.  This would    */
     229             : /*      normally only be overridden by formats with overviews.          */
     230             : /************************************************************************/
     231             : 
     232     6180720 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
     233             :                                  int nXSize, int nYSize, void *pData,
     234             :                                  int nBufXSize, int nBufYSize,
     235             :                                  GDALDataType eBufType, GSpacing nPixelSpace,
     236             :                                  GSpacing nLineSpace,
     237             :                                  GDALRasterIOExtraArg *psExtraArg)
     238             : 
     239             : {
     240     6180720 :     if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
     241             :     {
     242           0 :         CPLError(eFlushBlockErr, CPLE_AppDefined,
     243             :                  "An error occurred while writing a dirty block "
     244             :                  "from GDALRasterBand::IRasterIO");
     245           0 :         CPLErr eErr = eFlushBlockErr;
     246           0 :         eFlushBlockErr = CE_None;
     247           0 :         return eErr;
     248             :     }
     249     6180720 :     if (nBlockXSize <= 0 || nBlockYSize <= 0)
     250             :     {
     251           0 :         CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
     252           0 :         return CE_Failure;
     253             :     }
     254             : 
     255     6180720 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
     256     6180720 :     const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
     257     6180720 :     GByte dummyBlock[2] = {0, 0};
     258     6180720 :     GByte *pabySrcBlock =
     259             :         dummyBlock; /* to avoid Coverity warning about nullptr dereference */
     260     6180720 :     GDALRasterBlock *poBlock = nullptr;
     261     6180720 :     const bool bUseIntegerRequestCoords =
     262     6545670 :         (!psExtraArg->bFloatingPointWindowValidity ||
     263      364948 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
     264      340016 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
     265             : 
     266             :     /* ==================================================================== */
     267             :     /*      A common case is the data requested with the destination        */
     268             :     /*      is packed, and the block width is the raster width.             */
     269             :     /* ==================================================================== */
     270     6088900 :     if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
     271     3234430 :         nBlockXSize == GetXSize() && nBufXSize == nXSize &&
     272    12269600 :         nBufYSize == nYSize && bUseIntegerRequestCoords)
     273             :     {
     274     3096630 :         CPLErr eErr = CE_None;
     275     3096630 :         int nLBlockY = -1;
     276             : 
     277     9751410 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     278             :         {
     279     6655860 :             const int iSrcY = iBufYOff + nYOff;
     280             : 
     281     6655860 :             if (iSrcY < nLBlockY * nBlockYSize ||
     282     6655860 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
     283             :             {
     284     3365160 :                 nLBlockY = iSrcY / nBlockYSize;
     285     3365160 :                 bool bJustInitialize =
     286      297355 :                     eRWFlag == GF_Write && nXOff == 0 &&
     287     3720440 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     288       57921 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
     289             : 
     290             :                 // Is this a partial tile at right and/or bottom edges of
     291             :                 // the raster, and that is going to be completely written?
     292             :                 // If so, do not load it from storage, but zero it so that
     293             :                 // the content outside of the validity area is initialized.
     294     3365160 :                 bool bMemZeroBuffer = false;
     295      297355 :                 if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
     296       24978 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     297     3662610 :                     nYOff + nYSize == GetYSize() &&
     298          90 :                     nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
     299             :                 {
     300          90 :                     bJustInitialize = true;
     301          90 :                     bMemZeroBuffer = true;
     302             :                 }
     303             : 
     304     3365160 :                 if (poBlock)
     305      268533 :                     poBlock->DropLock();
     306             : 
     307     3365160 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     308     3365160 :                 poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
     309     3365160 :                 if (poBlock == nullptr)
     310             :                 {
     311        1078 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     312             :                         nullptr)
     313             :                     {
     314           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     315             :                                  "GetBlockRef failed at X block offset %d, "
     316             :                                  "Y block offset %d%s",
     317             :                                  0, nLBlockY,
     318           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     319           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     320             :                                      : "");
     321             :                     }
     322        1078 :                     eErr = CE_Failure;
     323        1078 :                     break;
     324             :                 }
     325             : 
     326     3364090 :                 if (eRWFlag == GF_Write)
     327      297355 :                     poBlock->MarkDirty();
     328             : 
     329     3364090 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     330     3364090 :                 if (bMemZeroBuffer)
     331             :                 {
     332          90 :                     memset(pabySrcBlock, 0,
     333          90 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     334          90 :                                nBlockXSize * nBlockYSize);
     335             :                 }
     336             :             }
     337             : 
     338     6654780 :             const auto nSrcByteOffset =
     339     6654780 :                 (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
     340     6654780 :                      nBlockXSize +
     341     6654780 :                  nXOff) *
     342     6654780 :                 nBandDataSize;
     343             : 
     344     6654780 :             if (eDataType == eBufType)
     345             :             {
     346     2991080 :                 if (eRWFlag == GF_Read)
     347     2518500 :                     memcpy(static_cast<GByte *>(pData) +
     348     2518500 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     349     2518500 :                            pabySrcBlock + nSrcByteOffset,
     350             :                            static_cast<size_t>(nLineSpace));
     351             :                 else
     352      472580 :                     memcpy(pabySrcBlock + nSrcByteOffset,
     353      472580 :                            static_cast<GByte *>(pData) +
     354      472580 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     355             :                            static_cast<size_t>(nLineSpace));
     356             :             }
     357             :             else
     358             :             {
     359             :                 // Type to type conversion.
     360     3663710 :                 if (eRWFlag == GF_Read)
     361     3641640 :                     GDALCopyWords64(
     362     3641640 :                         pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
     363             :                         static_cast<GByte *>(pData) +
     364     3641640 :                             static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     365             :                         eBufType, static_cast<int>(nPixelSpace), nBufXSize);
     366             :                 else
     367       22065 :                     GDALCopyWords64(static_cast<GByte *>(pData) +
     368       22065 :                                         static_cast<GPtrDiff_t>(iBufYOff) *
     369             :                                             nLineSpace,
     370             :                                     eBufType, static_cast<int>(nPixelSpace),
     371       22065 :                                     pabySrcBlock + nSrcByteOffset, eDataType,
     372             :                                     nBandDataSize, nBufXSize);
     373             :             }
     374             : 
     375     6742690 :             if (psExtraArg->pfnProgress != nullptr &&
     376       87908 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     377             :                                          psExtraArg->pProgressData))
     378             :             {
     379           5 :                 eErr = CE_Failure;
     380           5 :                 break;
     381             :             }
     382             :         }
     383             : 
     384     3096630 :         if (poBlock)
     385     3095550 :             poBlock->DropLock();
     386             : 
     387     3096630 :         return eErr;
     388             :     }
     389             : 
     390             :     /* ==================================================================== */
     391             :     /*      Do we have overviews that would be appropriate to satisfy       */
     392             :     /*      this request?                                                   */
     393             :     /* ==================================================================== */
     394     3084090 :     if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
     395             :         eRWFlag == GF_Read)
     396             :     {
     397             :         GDALRasterIOExtraArg sExtraArg;
     398        2967 :         GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
     399             : 
     400             :         const int nOverview =
     401        2967 :             GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
     402             :                                           nBufXSize, nBufYSize, &sExtraArg);
     403        2967 :         if (nOverview >= 0)
     404             :         {
     405        2892 :             GDALRasterBand *poOverviewBand = GetOverview(nOverview);
     406        2892 :             if (poOverviewBand == nullptr)
     407        2892 :                 return CE_Failure;
     408             : 
     409        2892 :             return poOverviewBand->RasterIO(
     410             :                 eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
     411        2892 :                 nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
     412             :         }
     413             :     }
     414             : 
     415      891712 :     if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
     416           6 :         nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
     417     3972910 :         nLineSpace == nPixelSpace * nBufXSize &&
     418           6 :         CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
     419             :     {
     420           0 :         memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
     421           0 :         return CE_None;
     422             :     }
     423             : 
     424             :     /* ==================================================================== */
     425             :     /*      The second case when we don't need subsample data but likely    */
     426             :     /*      need data type conversion.                                      */
     427             :     /* ==================================================================== */
     428     3081190 :     if (  // nPixelSpace == nBufDataSize &&
     429     3081190 :         nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
     430             :     {
     431             : #if DEBUG_VERBOSE
     432             :         printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
     433             :                nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
     434             : #endif
     435             : 
     436             :         /* --------------------------------------------------------------------
     437             :          */
     438             :         /*      Loop over buffer computing source locations. */
     439             :         /* --------------------------------------------------------------------
     440             :          */
     441             :         // Calculate starting values out of loop
     442     2503280 :         const int nLBlockXStart = nXOff / nBlockXSize;
     443     2503280 :         const int nXSpanEnd = nBufXSize + nXOff;
     444             : 
     445     2503280 :         int iBufYOff = 0;
     446     2503280 :         int iSrcY = nYOff;
     447             :         while (true)
     448             :         {
     449     2544130 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     450             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     451     2544130 :             int nLBlockY = iSrcY / nBlockYSize;
     452     2544130 :             int nLBlockX = nLBlockXStart;
     453     2544130 :             int iSrcX = nXOff;
     454     5362820 :             while (iSrcX < nXSpanEnd)
     455             :             {
     456     2818770 :                 int nXSpan = nLBlockX * nBlockXSize;
     457     2818770 :                 if (nXSpan < INT_MAX - nBlockXSize)
     458     2818770 :                     nXSpan += nBlockXSize;
     459             :                 else
     460           0 :                     nXSpan = INT_MAX;
     461     2818770 :                 const int nXRight = nXSpan;
     462     2818770 :                 nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
     463             : 
     464             :                 const size_t nXSpanSize =
     465     2818770 :                     CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
     466             : 
     467     2818770 :                 bool bJustInitialize =
     468     2042970 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
     469       38035 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
     470     4888110 :                     nXOff <= nLBlockX * nBlockXSize &&
     471       26364 :                     nXOff + nXSize >= nXRight;
     472             : 
     473             :                 // Is this a partial tile at right and/or bottom edges of
     474             :                 // the raster, and that is going to be completely written?
     475             :                 // If so, do not load it from storage, but zero it so that
     476             :                 // the content outside of the validity area is initialized.
     477     2818770 :                 bool bMemZeroBuffer = false;
     478     2042970 :                 if (eRWFlag == GF_Write && !bJustInitialize &&
     479     2017850 :                     nXOff <= nLBlockX * nBlockXSize &&
     480     2016190 :                     nYOff <= nLBlockY * nBlockYSize &&
     481       12145 :                     (nXOff + nXSize >= nXRight ||
     482             :                      // cppcheck-suppress knownConditionTrueFalse
     483     4864460 :                      (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
     484       11965 :                     (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
     485       10743 :                      (nYOff + nYSize == GetYSize() &&
     486        1951 :                       nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
     487             :                 {
     488        3173 :                     bJustInitialize = true;
     489        3173 :                     bMemZeroBuffer = true;
     490             :                 }
     491             : 
     492             :                 /* --------------------------------------------------------------------
     493             :                  */
     494             :                 /*      Ensure we have the appropriate block loaded. */
     495             :                 /* --------------------------------------------------------------------
     496             :                  */
     497     2818770 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     498     2818770 :                 poBlock =
     499     2818770 :                     GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     500     2818770 :                 if (!poBlock)
     501             :                 {
     502          73 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     503             :                         nullptr)
     504             :                     {
     505           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     506             :                                  "GetBlockRef failed at X block offset %d, "
     507             :                                  "Y block offset %d%s",
     508             :                                  nLBlockX, nLBlockY,
     509           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     510           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     511             :                                      : "");
     512             :                     }
     513          73 :                     return (CE_Failure);
     514             :                 }
     515             : 
     516     2818700 :                 if (eRWFlag == GF_Write)
     517     2042970 :                     poBlock->MarkDirty();
     518             : 
     519     2818700 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     520     2818700 :                 if (bMemZeroBuffer)
     521             :                 {
     522        3173 :                     memset(pabySrcBlock, 0,
     523        3173 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     524        3173 :                                nBlockXSize * nBlockYSize);
     525             :                 }
     526             :                 /* --------------------------------------------------------------------
     527             :                  */
     528             :                 /*      Copy over this chunk of data. */
     529             :                 /* --------------------------------------------------------------------
     530             :                  */
     531     2818700 :                 GPtrDiff_t iSrcOffset =
     532     2818700 :                     (static_cast<GPtrDiff_t>(iSrcX) -
     533     2818700 :                      static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
     534     2818700 :                      (static_cast<GPtrDiff_t>(iSrcY) -
     535     2818700 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     536     2818700 :                          nBlockXSize) *
     537     2818700 :                     nBandDataSize;
     538             :                 // Fill up as many rows as possible for the loaded block.
     539     5637390 :                 const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
     540     2818700 :                                           nBufYSize - iBufYOff);
     541    60991500 :                 for (int k = 0; k < kmax; k++)
     542             :                 {
     543    58172800 :                     if (eDataType == eBufType && nPixelSpace == nBufDataSize)
     544             :                     {
     545    53770900 :                         if (eRWFlag == GF_Read)
     546    49332700 :                             memcpy(static_cast<GByte *>(pData) + iBufOffset +
     547    49332700 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     548    49332700 :                                    pabySrcBlock + iSrcOffset, nXSpanSize);
     549             :                         else
     550     4438130 :                             memcpy(pabySrcBlock + iSrcOffset,
     551     4438130 :                                    static_cast<GByte *>(pData) + iBufOffset +
     552     4438130 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     553             :                                    nXSpanSize);
     554             :                     }
     555             :                     else
     556             :                     {
     557             :                         /* type to type conversion */
     558     4401910 :                         if (eRWFlag == GF_Read)
     559     4251700 :                             GDALCopyWords64(
     560     4251700 :                                 pabySrcBlock + iSrcOffset, eDataType,
     561             :                                 nBandDataSize,
     562     4251700 :                                 static_cast<GByte *>(pData) + iBufOffset +
     563     4251700 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     564             :                                 eBufType, static_cast<int>(nPixelSpace),
     565             :                                 nXSpan);
     566             :                         else
     567      150209 :                             GDALCopyWords64(
     568      150209 :                                 static_cast<GByte *>(pData) + iBufOffset +
     569      150209 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     570             :                                 eBufType, static_cast<int>(nPixelSpace),
     571      150209 :                                 pabySrcBlock + iSrcOffset, eDataType,
     572             :                                 nBandDataSize, nXSpan);
     573             :                     }
     574             : 
     575    58172800 :                     iSrcOffset +=
     576    58172800 :                         static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
     577             :                 }
     578             : 
     579             :                 iBufOffset =
     580     2818700 :                     CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
     581     2818700 :                 nLBlockX++;
     582     2818700 :                 iSrcX += nXSpan;
     583             : 
     584     2818700 :                 poBlock->DropLock();
     585     2818700 :                 poBlock = nullptr;
     586             :             }
     587             : 
     588             :             /* Compute the increment to go on a block boundary */
     589     2544050 :             const int nYInc = nBlockYSize - (iSrcY % nBlockYSize);
     590             : 
     591     2545940 :             if (psExtraArg->pfnProgress != nullptr &&
     592        1884 :                 !psExtraArg->pfnProgress(
     593     2545940 :                     1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
     594             :                     psExtraArg->pProgressData))
     595             :             {
     596           0 :                 return CE_Failure;
     597             :             }
     598             : 
     599     2544050 :             iBufYOff += nYInc;
     600     2544050 :             if (iBufYOff >= nBufYSize)
     601     2503210 :                 break;
     602             :             // Only increment iSrcY after above loop end check, to avoid
     603             :             // potential int overflow.
     604       40846 :             iSrcY += nYInc;
     605       40846 :         }
     606             : 
     607     2503210 :         return CE_None;
     608             :     }
     609             : 
     610             :     /* ==================================================================== */
     611             :     /*      Loop reading required source blocks to satisfy output           */
     612             :     /*      request.  This is the most general implementation.              */
     613             :     /* ==================================================================== */
     614             : 
     615      577913 :     double dfXOff = nXOff;
     616      577913 :     double dfYOff = nYOff;
     617      577913 :     double dfXSize = nXSize;
     618      577913 :     double dfYSize = nYSize;
     619      577913 :     if (psExtraArg->bFloatingPointWindowValidity)
     620             :     {
     621      242956 :         dfXOff = psExtraArg->dfXOff;
     622      242956 :         dfYOff = psExtraArg->dfYOff;
     623      242956 :         dfXSize = psExtraArg->dfXSize;
     624      242956 :         dfYSize = psExtraArg->dfYSize;
     625             :     }
     626             : 
     627             :     /* -------------------------------------------------------------------- */
     628             :     /*      Compute stepping increment.                                     */
     629             :     /* -------------------------------------------------------------------- */
     630      577913 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
     631      577913 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
     632      577913 :     CPLErr eErr = CE_None;
     633             : 
     634      577913 :     if (eRWFlag == GF_Write)
     635             :     {
     636             :         /* --------------------------------------------------------------------
     637             :          */
     638             :         /*    Write case */
     639             :         /*    Loop over raster window computing source locations in the buffer.
     640             :          */
     641             :         /* --------------------------------------------------------------------
     642             :          */
     643      166655 :         GByte *pabyDstBlock = nullptr;
     644      166655 :         int nLBlockX = -1;
     645      166655 :         int nLBlockY = -1;
     646             : 
     647     1260010 :         for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
     648             :         {
     649     1093360 :             const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
     650             : 
     651    12384200 :             for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
     652             :             {
     653    11290800 :                 const int iBufXOff =
     654    11290800 :                     static_cast<int>((iDstX - nXOff) / dfSrcXInc);
     655    11290800 :                 GPtrDiff_t iBufOffset =
     656    11290800 :                     static_cast<GPtrDiff_t>(iBufYOff) *
     657             :                         static_cast<GPtrDiff_t>(nLineSpace) +
     658    11290800 :                     iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
     659             : 
     660             :                 // FIXME: this code likely doesn't work if the dirty block gets
     661             :                 // flushed to disk before being completely written.
     662             :                 // In the meantime, bJustInitialize should probably be set to
     663             :                 // FALSE even if it is not ideal performance wise, and for
     664             :                 // lossy compression.
     665             : 
     666             :                 /* --------------------------------------------------------------------
     667             :                  */
     668             :                 /*      Ensure we have the appropriate block loaded. */
     669             :                 /* --------------------------------------------------------------------
     670             :                  */
     671    11290800 :                 if (iDstX < nLBlockX * nBlockXSize ||
     672    11041500 :                     iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
     673    10584800 :                     iDstY < nLBlockY * nBlockYSize ||
     674    10584800 :                     iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
     675             :                 {
     676      738702 :                     nLBlockX = iDstX / nBlockXSize;
     677      738702 :                     nLBlockY = iDstY / nBlockYSize;
     678             : 
     679      738702 :                     const bool bJustInitialize =
     680     1065990 :                         nYOff <= nLBlockY * nBlockYSize &&
     681      327291 :                         nYOff + nYSize - nBlockYSize >=
     682      327291 :                             nLBlockY * nBlockYSize &&
     683     1116320 :                         nXOff <= nLBlockX * nBlockXSize &&
     684       50325 :                         nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
     685             :                     /*bool bMemZeroBuffer = FALSE;
     686             :                     if( !bJustInitialize &&
     687             :                         nXOff <= nLBlockX * nBlockXSize &&
     688             :                         nYOff <= nLBlockY * nBlockYSize &&
     689             :                         (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
     690             :                          (nXOff + nXSize == GetXSize() &&
     691             :                          (nLBlockX+1) * nBlockXSize > GetXSize())) &&
     692             :                         (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
     693             :                          (nYOff + nYSize == GetYSize() &&
     694             :                          (nLBlockY+1) * nBlockYSize > GetYSize())) )
     695             :                     {
     696             :                         bJustInitialize = TRUE;
     697             :                         bMemZeroBuffer = TRUE;
     698             :                     }*/
     699      738702 :                     if (poBlock != nullptr)
     700      572047 :                         poBlock->DropLock();
     701             : 
     702      738702 :                     poBlock =
     703      738702 :                         GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     704      738702 :                     if (poBlock == nullptr)
     705             :                     {
     706           0 :                         return (CE_Failure);
     707             :                     }
     708             : 
     709      738702 :                     poBlock->MarkDirty();
     710             : 
     711      738702 :                     pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
     712             :                     /*if( bMemZeroBuffer )
     713             :                     {
     714             :                         memset(pabyDstBlock, 0,
     715             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
     716             :                     * nBlockYSize);
     717             :                     }*/
     718             :                 }
     719             : 
     720             :                 // To make Coverity happy. Should not happen by design.
     721    11290800 :                 if (pabyDstBlock == nullptr)
     722             :                 {
     723           0 :                     CPLAssert(false);
     724             :                     eErr = CE_Failure;
     725             :                     break;
     726             :                 }
     727             : 
     728             :                 /* --------------------------------------------------------------------
     729             :                  */
     730             :                 /*      Copy over this pixel of data. */
     731             :                 /* --------------------------------------------------------------------
     732             :                  */
     733    11290800 :                 GPtrDiff_t iDstOffset =
     734    11290800 :                     (static_cast<GPtrDiff_t>(iDstX) -
     735    11290800 :                      static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
     736    11290800 :                      (static_cast<GPtrDiff_t>(iDstY) -
     737    11290800 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     738    11290800 :                          nBlockXSize) *
     739    11290800 :                     nBandDataSize;
     740             : 
     741    11290800 :                 if (eDataType == eBufType)
     742             :                 {
     743    11287700 :                     memcpy(pabyDstBlock + iDstOffset,
     744    11287700 :                            static_cast<GByte *>(pData) + iBufOffset,
     745             :                            nBandDataSize);
     746             :                 }
     747             :                 else
     748             :                 {
     749             :                     /* type to type conversion ... ouch, this is expensive way
     750             :                     of handling single words */
     751        3096 :                     GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
     752        3096 :                                     eBufType, 0, pabyDstBlock + iDstOffset,
     753             :                                     eDataType, 0, 1);
     754             :                 }
     755             :             }
     756             : 
     757     1093360 :             if (psExtraArg->pfnProgress != nullptr &&
     758           0 :                 !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
     759             :                                          psExtraArg->pProgressData))
     760             :             {
     761           0 :                 eErr = CE_Failure;
     762           0 :                 break;
     763             :             }
     764             :         }
     765             :     }
     766             :     else
     767             :     {
     768      411258 :         if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
     769             :         {
     770       42075 :             if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
     771       13559 :                  psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
     772       13506 :                  psExtraArg->eResampleAlg == GRIORA_Bilinear ||
     773       28563 :                  psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
     774        3224 :                 GetColorTable() != nullptr)
     775             :             {
     776           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     777             :                          "Resampling method not supported on paletted band. "
     778             :                          "Falling back to nearest neighbour");
     779             :             }
     780       14261 :             else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
     781           3 :                      GDALDataTypeIsComplex(eDataType))
     782             :             {
     783           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     784             :                          "Resampling method not supported on complex data type "
     785             :                          "band. Falling back to nearest neighbour");
     786             :             }
     787             :             else
     788             :             {
     789       14258 :                 return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
     790             :                                          pData, nBufXSize, nBufYSize, eBufType,
     791       14258 :                                          nPixelSpace, nLineSpace, psExtraArg);
     792             :             }
     793             :         }
     794             : 
     795      397000 :         int nLimitBlockY = 0;
     796      397000 :         const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
     797      397000 :         int nStartBlockX = -nBlockXSize;
     798      397000 :         constexpr double EPS = 1e-10;
     799      397000 :         int nLBlockY = -1;
     800      397000 :         const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
     801      397000 :         const bool bIntegerXFactor =
     802      372767 :             bUseIntegerRequestCoords &&
     803      670836 :             static_cast<int>(dfSrcXInc) == dfSrcXInc &&
     804      273836 :             static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
     805             : 
     806             :         /* --------------------------------------------------------------------
     807             :          */
     808             :         /*      Read case */
     809             :         /*      Loop over buffer computing source locations. */
     810             :         /* --------------------------------------------------------------------
     811             :          */
     812     2367100 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     813             :         {
     814             :             // Add small epsilon to avoid some numeric precision issues.
     815     1970110 :             const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
     816     1970110 :             const int iSrcY = static_cast<int>(std::min(
     817     1970110 :                 std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
     818             : 
     819     1970110 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     820             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     821             : 
     822     1970110 :             if (iSrcY >= nLimitBlockY)
     823             :             {
     824      438018 :                 nLBlockY = iSrcY / nBlockYSize;
     825      438018 :                 nLimitBlockY = nLBlockY * nBlockYSize;
     826      438018 :                 if (nLimitBlockY < INT_MAX - nBlockYSize)
     827      438018 :                     nLimitBlockY += nBlockYSize;
     828             :                 else
     829           0 :                     nLimitBlockY = INT_MAX;
     830             :                 // Make sure a new block is loaded.
     831      438018 :                 nStartBlockX = -nBlockXSize;
     832             :             }
     833     1532090 :             else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
     834             :             {
     835             :                 // Make sure a new block is loaded.
     836      437363 :                 nStartBlockX = -nBlockXSize;
     837             :             }
     838             : 
     839     1970110 :             GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
     840     1970110 :                                        static_cast<GPtrDiff_t>(nBlockXSize);
     841             : 
     842     1970110 :             if (bIntegerXFactor)
     843             :             {
     844      695850 :                 int iSrcX = static_cast<int>(dfSrcXStart);
     845      695850 :                 const int nSrcXInc = static_cast<int>(dfSrcXInc);
     846      695850 :                 GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
     847      695850 :                 bool bRet = false;
     848      695850 :                 if (bByteCopy)
     849             :                 {
     850      585842 :                     bRet = DownsamplingIntegerXFactor<true, 1>(
     851             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     852             :                         static_cast<int>(nPixelSpace), nBufXSize, GDT_UInt8,
     853             :                         GDT_UInt8, nStartBlockX, nBlockXSize, poBlock,
     854             :                         nLBlockY);
     855             :                 }
     856      110008 :                 else if (eDataType == eBufType)
     857             :                 {
     858      109783 :                     switch (nBandDataSize)
     859             :                     {
     860      109630 :                         case 2:
     861      109630 :                             bRet = DownsamplingIntegerXFactor<true, 2>(
     862             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     863             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     864             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     865             :                                 nBlockXSize, poBlock, nLBlockY);
     866      109630 :                             break;
     867          55 :                         case 4:
     868          55 :                             bRet = DownsamplingIntegerXFactor<true, 4>(
     869             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     870             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     871             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     872             :                                 nBlockXSize, poBlock, nLBlockY);
     873          55 :                             break;
     874          96 :                         case 8:
     875          96 :                             bRet = DownsamplingIntegerXFactor<true, 8>(
     876             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     877             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     878             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     879             :                                 nBlockXSize, poBlock, nLBlockY);
     880          96 :                             break;
     881           2 :                         case 16:
     882           2 :                             bRet = DownsamplingIntegerXFactor<true, 16>(
     883             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     884             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     885             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     886             :                                 nBlockXSize, poBlock, nLBlockY);
     887           2 :                             break;
     888           0 :                         default:
     889           0 :                             CPLAssert(false);
     890             :                             break;
     891             :                     }
     892             :                 }
     893             :                 else
     894             :                 {
     895         225 :                     bRet = DownsamplingIntegerXFactor<false, 0>(
     896             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     897             :                         static_cast<int>(nPixelSpace), nBufXSize, eDataType,
     898             :                         eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     899             :                 }
     900      695850 :                 if (!bRet)
     901           1 :                     eErr = CE_Failure;
     902             :             }
     903             :             else
     904             :             {
     905     1274260 :                 double dfSrcX = dfSrcXStart;
     906   503811000 :                 for (int iBufXOff = 0; iBufXOff < nBufXSize;
     907   502537000 :                      iBufXOff++, dfSrcX += dfSrcXInc)
     908             :                 {
     909             :                     // TODO?: try to avoid the clamping for most iterations
     910             :                     const int iSrcX = static_cast<int>(
     911  1005070000 :                         std::min(std::max(0.0, dfSrcX),
     912   502537000 :                                  static_cast<double>(nRasterXSize - 1)));
     913             : 
     914             :                     /* --------------------------------------------------------------------
     915             :                      */
     916             :                     /*      Ensure we have the appropriate block loaded. */
     917             :                     /* --------------------------------------------------------------------
     918             :                      */
     919   502537000 :                     if (iSrcX >= nBlockXSize + nStartBlockX)
     920             :                     {
     921     1697820 :                         const int nLBlockX = iSrcX / nBlockXSize;
     922     1697820 :                         nStartBlockX = nLBlockX * nBlockXSize;
     923             : 
     924     1697820 :                         if (poBlock != nullptr)
     925     1574650 :                             poBlock->DropLock();
     926             : 
     927     1697820 :                         poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     928     1697820 :                         if (poBlock == nullptr)
     929             :                         {
     930           9 :                             eErr = CE_Failure;
     931           9 :                             break;
     932             :                         }
     933             : 
     934             :                         pabySrcBlock =
     935     1697810 :                             static_cast<GByte *>(poBlock->GetDataRef());
     936             :                     }
     937   502537000 :                     const GPtrDiff_t nDiffX =
     938   502537000 :                         static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
     939             : 
     940             :                     /* --------------------------------------------------------------------
     941             :                      */
     942             :                     /*      Copy over this pixel of data. */
     943             :                     /* --------------------------------------------------------------------
     944             :                      */
     945             : 
     946   502537000 :                     if (bByteCopy)
     947             :                     {
     948   442592000 :                         GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
     949   442592000 :                         static_cast<GByte *>(pData)[iBufOffset] =
     950   442592000 :                             pabySrcBlock[iSrcOffset];
     951             :                     }
     952    59944700 :                     else if (eDataType == eBufType)
     953             :                     {
     954    50322800 :                         GPtrDiff_t iSrcOffset =
     955    50322800 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     956    50322800 :                         memcpy(static_cast<GByte *>(pData) + iBufOffset,
     957    50322800 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
     958             :                     }
     959             :                     else
     960             :                     {
     961             :                         // Type to type conversion ...
     962     9621890 :                         GPtrDiff_t iSrcOffset =
     963     9621890 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     964     9621890 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
     965             :                                         static_cast<GByte *>(pData) +
     966     9621890 :                                             iBufOffset,
     967             :                                         eBufType, 0, 1);
     968             :                     }
     969             : 
     970   502537000 :                     iBufOffset += static_cast<int>(nPixelSpace);
     971             :                 }
     972             :             }
     973     1970110 :             if (eErr == CE_Failure)
     974          11 :                 break;
     975             : 
     976     2191530 :             if (psExtraArg->pfnProgress != nullptr &&
     977      221434 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     978             :                                          psExtraArg->pProgressData))
     979             :             {
     980           1 :                 eErr = CE_Failure;
     981           1 :                 break;
     982             :             }
     983             :         }
     984             :     }
     985             : 
     986      563655 :     if (poBlock != nullptr)
     987      563645 :         poBlock->DropLock();
     988             : 
     989      563655 :     return eErr;
     990             : }
     991             : 
     992             : /************************************************************************/
     993             : /*                      GDALRasterIOTransformer()                       */
     994             : /************************************************************************/
     995             : 
     996             : struct GDALRasterIOTransformerStruct  // Argument block read by GDALRasterIOTransformer(): one offset and one ratio per axis.
     997             : {
     998             :     double dfXOff;  // Added to the scaled X in the dst->src direction; subtracted first in the inverse direction.
     999             :     double dfYOff;  // Same role as dfXOff, for the Y coordinate.
    1000             :     double dfXRatioDstToSrc;  // Multiplies X in the dst->src direction; divides in the inverse direction.
    1001             :     double dfYRatioDstToSrc;  // Same role as dfXRatioDstToSrc, for the Y coordinate.
    1002             : };
    1003             : 
    1004        6897 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,  // Per-axis scale-and-offset coordinate transformer.
    1005             :                                    int nPointCount, double *x, double *y,  // x and y hold nPointCount coordinates each and are overwritten in place.
    1006             :                                    double * /* z */, int *panSuccess)  // The third coordinate array is unnamed and never read or written.
    1007             : {  // Returns TRUE unconditionally; per-point status is written to panSuccess.
    1008        6897 :     GDALRasterIOTransformerStruct *psParams =
    1009             :         static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);  // Cast without any check: the caller must pass a GDALRasterIOTransformerStruct.
    1010        6897 :     if (bDstToSrc)  // Non-zero selects the forward mapping: multiply by the ratio, then add the offset.
    1011             :     {
    1012      311993 :         for (int i = 0; i < nPointCount; i++)
    1013             :         {
    1014      305684 :             x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
    1015      305684 :             y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
    1016      305684 :             panSuccess[i] = TRUE;  // Every point is flagged as successfully transformed.
    1017             :         }
    1018             :     }
    1019             :     else  // Inverse mapping: subtract the offset, then divide by the ratio.
    1020             :     {
    1021        1176 :         for (int i = 0; i < nPointCount; i++)
    1022             :         {
    1023         588 :             x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;  // No guard against a zero ratio is applied here.
    1024         588 :             y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
    1025         588 :             panSuccess[i] = TRUE;  // Flagged as success without inspecting the computed value.
    1026             :         }
    1027             :     }
    1028        6897 :     return TRUE;  // The call as a whole always reports success.
    1029             : }
    1030             : 
    1031             : /************************************************************************/
    1032             : /*                         RasterIOResampled()                          */
    1033             : /************************************************************************/
    1034             : 
    1035             : //! @cond Doxygen_Suppress
    1036       14258 : CPLErr GDALRasterBand::RasterIOResampled(
    1037             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1038             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1039             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    1040             : {
    1041             :     // Determine if we use warping resampling or overview resampling
    1042             :     const bool bUseWarp =
    1043       14258 :         (GDALDataTypeIsComplex(eDataType) &&
    1044       14417 :          psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    1045         159 :          psExtraArg->eResampleAlg != GRIORA_Mode);
    1046             : 
    1047       14258 :     double dfXOff = nXOff;
    1048       14258 :     double dfYOff = nYOff;
    1049       14258 :     double dfXSize = nXSize;
    1050       14258 :     double dfYSize = nYSize;
    1051       14258 :     if (psExtraArg->bFloatingPointWindowValidity)
    1052             :     {
    1053       13512 :         dfXOff = psExtraArg->dfXOff;
    1054       13512 :         dfYOff = psExtraArg->dfYOff;
    1055       13512 :         dfXSize = psExtraArg->dfXSize;
    1056       13512 :         dfYSize = psExtraArg->dfYSize;
    1057             :     }
    1058             : 
    1059       14258 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1060       14258 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1061             : 
    1062             :     // Determine the coordinates in the "virtual" output raster to see
    1063             :     // if they are integers, in which case we will use them as a shift
    1064             :     // so that subwindow extracts give the exact same results as entire raster
    1065             :     // scaling.
    1066       14258 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1067       14258 :     bool bHasXOffVirtual = false;
    1068       14258 :     int nDestXOffVirtual = 0;
    1069       14258 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1070             :     {
    1071       13930 :         bHasXOffVirtual = true;
    1072       13930 :         dfXOff = nXOff;
    1073       13930 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1074             :     }
    1075             : 
    1076       14258 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1077       14258 :     bool bHasYOffVirtual = false;
    1078       14258 :     int nDestYOffVirtual = 0;
    1079       14258 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1080             :     {
    1081       13926 :         bHasYOffVirtual = true;
    1082       13926 :         dfYOff = nYOff;
    1083       13926 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1084             :     }
    1085             : 
    1086             :     // Create a MEM dataset that wraps the output buffer.
    1087             :     GDALDataset *poMEMDS;
    1088       14258 :     void *pTempBuffer = nullptr;
    1089       14258 :     GSpacing nPSMem = nPixelSpace;
    1090       14258 :     GSpacing nLSMem = nLineSpace;
    1091       14258 :     void *pDataMem = pData;
    1092       14258 :     GDALDataType eDTMem = eBufType;
    1093       14258 :     if (eBufType != eDataType && !GDAL_GET_OPERATE_IN_BUF_TYPE(*psExtraArg))
    1094             :     {
    1095           4 :         nPSMem = GDALGetDataTypeSizeBytes(eDataType);
    1096           4 :         nLSMem = nPSMem * nBufXSize;
    1097             :         pTempBuffer =
    1098           4 :             VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
    1099           4 :         if (pTempBuffer == nullptr)
    1100           0 :             return CE_Failure;
    1101           4 :         pDataMem = pTempBuffer;
    1102           4 :         eDTMem = eDataType;
    1103             :     }
    1104             : 
    1105             :     poMEMDS =
    1106       14258 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1107             :                            nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
    1108       14258 :     GByte *pabyData = static_cast<GByte *>(pDataMem) -
    1109       14258 :                       nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
    1110       14258 :     GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
    1111             :         poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
    1112       14258 :     poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
    1113             : 
    1114       14258 :     const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1115       14258 :     const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    1116       14258 :     if (pszNBITS)
    1117           6 :         GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
    1118           6 :             "NBITS", pszNBITS, "IMAGE_STRUCTURE");
    1119             : 
    1120       14258 :     CPLErr eErr = CE_None;
    1121             : 
    1122             :     // Do the resampling.
    1123       14258 :     if (bUseWarp)
    1124             :     {
    1125         149 :         int bHasNoData = FALSE;
    1126         149 :         double dfNoDataValue = GetNoDataValue(&bHasNoData);
    1127             : 
    1128         149 :         VRTDatasetH hVRTDS = nullptr;
    1129         149 :         GDALRasterBandH hVRTBand = nullptr;
    1130         149 :         if (GetDataset() == nullptr)
    1131             :         {
    1132             :             /* Create VRT dataset that wraps the whole dataset */
    1133           0 :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1134           0 :             VRTAddBand(hVRTDS, eDataType, nullptr);
    1135           0 :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1136           0 :             VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
    1137             :                                0, 0, nRasterXSize, nRasterYSize, nullptr,
    1138             :                                VRT_NODATA_UNSET);
    1139             : 
    1140             :             /* Add a mask band if needed */
    1141           0 :             if (GetMaskFlags() != GMF_ALL_VALID)
    1142             :             {
    1143           0 :                 GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
    1144             :                 VRTSourcedRasterBand *poVRTMaskBand =
    1145             :                     reinterpret_cast<VRTSourcedRasterBand *>(
    1146             :                         reinterpret_cast<GDALRasterBand *>(hVRTBand)
    1147           0 :                             ->GetMaskBand());
    1148           0 :                 poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
    1149           0 :                                                  nRasterYSize, 0, 0,
    1150           0 :                                                  nRasterXSize, nRasterYSize);
    1151             :             }
    1152             :         }
    1153             : 
    1154         149 :         GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
    1155         149 :         switch (psExtraArg->eResampleAlg)
    1156             :         {
    1157           0 :             case GRIORA_NearestNeighbour:
    1158           0 :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1159           0 :                 break;
    1160         147 :             case GRIORA_Bilinear:
    1161         147 :                 psWarpOptions->eResampleAlg = GRA_Bilinear;
    1162         147 :                 break;
    1163           0 :             case GRIORA_Cubic:
    1164           0 :                 psWarpOptions->eResampleAlg = GRA_Cubic;
    1165           0 :                 break;
    1166           0 :             case GRIORA_CubicSpline:
    1167           0 :                 psWarpOptions->eResampleAlg = GRA_CubicSpline;
    1168           0 :                 break;
    1169           0 :             case GRIORA_Lanczos:
    1170           0 :                 psWarpOptions->eResampleAlg = GRA_Lanczos;
    1171           0 :                 break;
    1172           0 :             case GRIORA_Average:
    1173           0 :                 psWarpOptions->eResampleAlg = GRA_Average;
    1174           0 :                 break;
    1175           2 :             case GRIORA_RMS:
    1176           2 :                 psWarpOptions->eResampleAlg = GRA_RMS;
    1177           2 :                 break;
    1178           0 :             case GRIORA_Mode:
    1179           0 :                 psWarpOptions->eResampleAlg = GRA_Mode;
    1180           0 :                 break;
    1181           0 :             default:
    1182           0 :                 CPLAssert(false);
    1183             :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1184             :                 break;
    1185             :         }
    1186         149 :         psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
    1187         149 :         psWarpOptions->hDstDS = poMEMDS;
    1188         149 :         psWarpOptions->nBandCount = 1;
    1189         149 :         int nSrcBandNumber = hVRTDS ? 1 : nBand;
    1190         149 :         int nDstBandNumber = 1;
    1191         149 :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1192         149 :         psWarpOptions->panDstBands = &nDstBandNumber;
    1193         298 :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress
    1194         149 :                                          ? psExtraArg->pfnProgress
    1195             :                                          : GDALDummyProgress;
    1196         149 :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1197         149 :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1198         149 :         if (bHasNoData)
    1199             :         {
    1200           0 :             psWarpOptions->papszWarpOptions = CSLSetNameValue(
    1201             :                 psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
    1202           0 :             if (psWarpOptions->padfSrcNoDataReal == nullptr)
    1203             :             {
    1204           0 :                 psWarpOptions->padfSrcNoDataReal =
    1205           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1206           0 :                 psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
    1207             :             }
    1208             : 
    1209           0 :             if (psWarpOptions->padfDstNoDataReal == nullptr)
    1210             :             {
    1211           0 :                 psWarpOptions->padfDstNoDataReal =
    1212           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1213           0 :                 psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
    1214             :             }
    1215             :         }
    1216             : 
    1217             :         GDALRasterIOTransformerStruct sTransformer;
    1218         149 :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1219         149 :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1220         149 :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1221         149 :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1222         149 :         psWarpOptions->pTransformerArg = &sTransformer;
    1223             : 
    1224             :         GDALWarpOperationH hWarpOperation =
    1225         149 :             GDALCreateWarpOperation(psWarpOptions);
    1226         149 :         eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
    1227             :                                      nDestYOffVirtual, nBufXSize, nBufYSize);
    1228         149 :         GDALDestroyWarpOperation(hWarpOperation);
    1229             : 
    1230         149 :         psWarpOptions->panSrcBands = nullptr;
    1231         149 :         psWarpOptions->panDstBands = nullptr;
    1232         149 :         GDALDestroyWarpOptions(psWarpOptions);
    1233             : 
    1234         149 :         if (hVRTDS)
    1235           0 :             GDALClose(hVRTDS);
    1236             :     }
    1237             :     else
    1238             :     {
    1239             :         const char *pszResampling =
    1240       14109 :             GDALRasterIOGetResampleAlg(psExtraArg->eResampleAlg);
    1241       14109 :         int nKernelRadius = 0;
    1242             :         GDALResampleFunction pfnResampleFunc =
    1243       14109 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1244       14109 :         CPLAssert(pfnResampleFunc);
    1245             :         GDALDataType eWrkDataType =
    1246       14109 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1247       14109 :         int nHasNoData = 0;
    1248       14109 :         double dfNoDataValue = GetNoDataValue(&nHasNoData);
    1249       14109 :         const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    1250       14109 :         if (!bHasNoData)
    1251       13977 :             dfNoDataValue = 0.0;
    1252             : 
    1253       14109 :         int nDstBlockXSize = nBufXSize;
    1254       14109 :         int nDstBlockYSize = nBufYSize;
    1255       14109 :         int nFullResXChunk = 0;
    1256       14109 :         int nFullResYChunk = 0;
    1257             :         while (true)
    1258             :         {
    1259       14120 :             nFullResXChunk =
    1260       14120 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1261       14120 :             nFullResYChunk =
    1262       14120 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1263       14120 :             if (nFullResXChunk > nRasterXSize)
    1264        4777 :                 nFullResXChunk = nRasterXSize;
    1265       14120 :             if (nFullResYChunk > nRasterYSize)
    1266         594 :                 nFullResYChunk = nRasterYSize;
    1267       14120 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1268       14062 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1269             :                  1024 * 1024))
    1270             :                 break;
    1271             :             // When operating on the full width of a raster whose block width is
    1272             :             // the raster width, prefer doing chunks in height.
    1273          11 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1274             :                 nDstBlockYSize > 1)
    1275           0 :                 nDstBlockYSize /= 2;
    1276             :             /* Otherwise cut the maximal dimension */
    1277          11 :             else if (nDstBlockXSize > 1 &&
    1278           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1279          11 :                 nDstBlockXSize /= 2;
    1280             :             else
    1281           0 :                 nDstBlockYSize /= 2;
    1282             :         }
    1283             : 
    1284       14109 :         int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
    1285       14109 :         int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    1286       14109 :         if (nOvrXFactor == 0)
    1287        2029 :             nOvrXFactor = 1;
    1288       14109 :         if (nOvrYFactor == 0)
    1289        2028 :             nOvrYFactor = 1;
    1290       14109 :         int nFullResXSizeQueried =
    1291       14109 :             nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
    1292       14109 :         int nFullResYSizeQueried =
    1293       14109 :             nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
    1294             : 
    1295       14109 :         if (nFullResXSizeQueried > nRasterXSize)
    1296        2734 :             nFullResXSizeQueried = nRasterXSize;
    1297       14109 :         if (nFullResYSizeQueried > nRasterYSize)
    1298         332 :             nFullResYSizeQueried = nRasterYSize;
    1299             : 
    1300             :         void *pChunk =
    1301       14109 :             VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    1302             :                                 nFullResXSizeQueried, nFullResYSizeQueried);
    1303       14109 :         GByte *pabyChunkNoDataMask = nullptr;
    1304             : 
    1305       14109 :         GDALRasterBand *poMaskBand = GetMaskBand();
    1306       14109 :         int l_nMaskFlags = GetMaskFlags();
    1307             : 
    1308       14109 :         bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
    1309       14109 :         if (bUseNoDataMask)
    1310             :         {
    1311        7525 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1312             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1313             :         }
    1314       14109 :         if (pChunk == nullptr ||
    1315        7525 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1316             :         {
    1317           0 :             GDALClose(poMEMDS);
    1318           0 :             CPLFree(pChunk);
    1319           0 :             CPLFree(pabyChunkNoDataMask);
    1320           0 :             VSIFree(pTempBuffer);
    1321           0 :             return CE_Failure;
    1322             :         }
    1323             : 
    1324       14109 :         const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
    1325       14109 :                                  DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
    1326       14109 :         int nBlocksDone = 0;
    1327             : 
    1328             :         int nDstYOff;
    1329       28218 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1330       14109 :              nDstYOff += nDstBlockYSize)
    1331             :         {
    1332             :             int nDstYCount;
    1333       14109 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1334       14109 :                 nDstYCount = nDstBlockYSize;
    1335             :             else
    1336           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1337             : 
    1338       14109 :             int nChunkYOff =
    1339       14109 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1340       14109 :             int nChunkYOff2 = nYOff + 1 +
    1341       14109 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1342             :                                                     dfYRatioDstToSrc));
    1343       14109 :             if (nChunkYOff2 > nRasterYSize)
    1344         782 :                 nChunkYOff2 = nRasterYSize;
    1345       14109 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1346       14109 :             CPLAssert(nYCount <= nFullResYChunk);
    1347             : 
    1348       14109 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
    1349       14109 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
    1350       14109 :             if (nChunkYOffQueried < 0)
    1351             :             {
    1352         491 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1353         491 :                 nChunkYOffQueried = 0;
    1354             :             }
    1355       14109 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1356         594 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1357       14109 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1358             : 
    1359       14109 :             int nDstXOff = 0;
    1360       28218 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1361       14109 :                  nDstXOff += nDstBlockXSize)
    1362             :             {
    1363       14109 :                 int nDstXCount = 0;
    1364       14109 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1365       14109 :                     nDstXCount = nDstBlockXSize;
    1366             :                 else
    1367           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1368             : 
    1369       14109 :                 int nChunkXOff =
    1370       14109 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1371       14109 :                 int nChunkXOff2 =
    1372       14109 :                     nXOff + 1 +
    1373       14109 :                     static_cast<int>(
    1374       14109 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1375       14109 :                 if (nChunkXOff2 > nRasterXSize)
    1376        8802 :                     nChunkXOff2 = nRasterXSize;
    1377       14109 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1378       14109 :                 CPLAssert(nXCount <= nFullResXChunk);
    1379             : 
    1380       14109 :                 int nChunkXOffQueried =
    1381       14109 :                     nChunkXOff - nKernelRadius * nOvrXFactor;
    1382       14109 :                 int nChunkXSizeQueried =
    1383       14109 :                     nXCount + 2 * nKernelRadius * nOvrXFactor;
    1384       14109 :                 if (nChunkXOffQueried < 0)
    1385             :                 {
    1386        2795 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1387        2795 :                     nChunkXOffQueried = 0;
    1388             :                 }
    1389       14109 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1390        2781 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1391       14109 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1392             : 
    1393             :                 // Read the source buffers.
    1394       14109 :                 eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1395             :                                 nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1396             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    1397             :                                 eWrkDataType, 0, 0, nullptr);
    1398             : 
    1399       14109 :                 bool bSkipResample = false;
    1400       14109 :                 bool bNoDataMaskFullyOpaque = false;
    1401       14109 :                 if (eErr == CE_None && bUseNoDataMask)
    1402             :                 {
    1403        7525 :                     eErr = poMaskBand->RasterIO(
    1404             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1405             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1406             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1407             :                         nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
    1408             : 
    1409             :                     /* Optimizations if mask is fully opaque or transparent */
    1410        7525 :                     int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1411        7525 :                     GByte bVal = pabyChunkNoDataMask[0];
    1412        7525 :                     int i = 1;
    1413    15237000 :                     for (; i < nPixels; i++)
    1414             :                     {
    1415    15230700 :                         if (pabyChunkNoDataMask[i] != bVal)
    1416        1168 :                             break;
    1417             :                     }
    1418        7525 :                     if (i == nPixels)
    1419             :                     {
    1420        6357 :                         if (bVal == 0)
    1421             :                         {
    1422       12094 :                             for (int j = 0; j < nDstYCount; j++)
    1423             :                             {
    1424        6377 :                                 GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
    1425             :                                                 static_cast<GByte *>(pDataMem) +
    1426        6377 :                                                     nLSMem * (j + nDstYOff) +
    1427        6377 :                                                     nDstXOff * nPSMem,
    1428             :                                                 eDTMem,
    1429             :                                                 static_cast<int>(nPSMem),
    1430             :                                                 nDstXCount);
    1431             :                             }
    1432        5717 :                             bSkipResample = true;
    1433             :                         }
    1434             :                         else
    1435             :                         {
    1436         640 :                             bNoDataMaskFullyOpaque = true;
    1437             :                         }
    1438             :                     }
    1439             :                 }
    1440             : 
    1441       14109 :                 if (!bSkipResample && eErr == CE_None)
    1442             :                 {
    1443        8389 :                     const bool bPropagateNoData = false;
    1444        8389 :                     void *pDstBuffer = nullptr;
    1445        8389 :                     GDALDataType eDstBufferDataType = GDT_Unknown;
    1446             :                     GDALRasterBand *poMEMBand =
    1447        8389 :                         GDALRasterBand::FromHandle(hMEMBand);
    1448        8389 :                     GDALOverviewResampleArgs args;
    1449        8389 :                     args.eSrcDataType = eDataType;
    1450        8389 :                     args.eOvrDataType = poMEMBand->GetRasterDataType();
    1451        8389 :                     args.nOvrXSize = poMEMBand->GetXSize();
    1452        8389 :                     args.nOvrYSize = poMEMBand->GetYSize();
    1453        8389 :                     args.nOvrNBITS = nNBITS;
    1454        8389 :                     args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1455        8389 :                     args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1456        8389 :                     args.dfSrcXDelta =
    1457        8389 :                         dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1458        8389 :                     args.dfSrcYDelta =
    1459        8389 :                         dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1460        8389 :                     args.eWrkDataType = eWrkDataType;
    1461        8389 :                     args.pabyChunkNodataMask =
    1462        8389 :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
    1463        8389 :                     args.nChunkXOff =
    1464        8389 :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1465        8389 :                     args.nChunkXSize = nChunkXSizeQueried;
    1466        8389 :                     args.nChunkYOff =
    1467        8389 :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1468        8389 :                     args.nChunkYSize = nChunkYSizeQueried;
    1469        8389 :                     args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1470        8389 :                     args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
    1471        8389 :                     args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1472        8389 :                     args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
    1473        8389 :                     args.pszResampling = pszResampling;
    1474        8389 :                     args.bHasNoData = bHasNoData;
    1475        8389 :                     args.dfNoDataValue = dfNoDataValue;
    1476        8389 :                     args.poColorTable = GetColorTable();
    1477        8389 :                     args.bPropagateNoData = bPropagateNoData;
    1478        8389 :                     eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
    1479             :                                            &eDstBufferDataType);
    1480        8389 :                     if (eErr == CE_None)
    1481             :                     {
    1482        8389 :                         eErr = poMEMBand->RasterIO(
    1483             :                             GF_Write, nDstXOff + nDestXOffVirtual,
    1484             :                             nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
    1485             :                             pDstBuffer, nDstXCount, nDstYCount,
    1486             :                             eDstBufferDataType, 0, 0, nullptr);
    1487             :                     }
    1488        8389 :                     CPLFree(pDstBuffer);
    1489             :                 }
    1490             : 
    1491       14109 :                 nBlocksDone++;
    1492       25031 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1493       10922 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    1494             :                                              "", psExtraArg->pProgressData))
    1495             :                 {
    1496           1 :                     eErr = CE_Failure;
    1497             :                 }
    1498             :             }
    1499             :         }
    1500             : 
    1501       14109 :         CPLFree(pChunk);
    1502       14109 :         CPLFree(pabyChunkNoDataMask);
    1503             :     }
    1504             : 
    1505       14258 :     if (pTempBuffer)
    1506             :     {
    1507           4 :         CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
    1508             :             GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
    1509             :             pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    1510             :             nullptr));
    1511             :     }
    1512       14258 :     GDALClose(poMEMDS);
    1513       14258 :     VSIFree(pTempBuffer);
    1514             : 
    1515       14258 :     return eErr;
    1516             : }
    1517             : 
    1518             : /************************************************************************/
    1519             : /*                         RasterIOResampled()                          */
    1520             : /************************************************************************/
    1521             : 
    1522         892 : CPLErr GDALDataset::RasterIOResampled(
    1523             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1524             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1525             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    1526             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    1527             : 
    1528             : {
    1529             : #if 0
    1530             :     // Determine if we use warping resampling or overview resampling
    1531             :     bool bUseWarp = false;
    1532             :     if( GDALDataTypeIsComplex( eDataType ) )
    1533             :         bUseWarp = true;
    1534             : #endif
    1535             : 
    1536         892 :     double dfXOff = nXOff;
    1537         892 :     double dfYOff = nYOff;
    1538         892 :     double dfXSize = nXSize;
    1539         892 :     double dfYSize = nYSize;
    1540         892 :     if (psExtraArg->bFloatingPointWindowValidity)
    1541             :     {
    1542         765 :         dfXOff = psExtraArg->dfXOff;
    1543         765 :         dfYOff = psExtraArg->dfYOff;
    1544         765 :         dfXSize = psExtraArg->dfXSize;
    1545         765 :         dfYSize = psExtraArg->dfYSize;
    1546             :     }
    1547             : 
    1548         892 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1549         892 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1550             : 
    1551             :     // Determine the coordinates in the "virtual" output raster to see
    1552             :     // if they are integers, in which case we will use them as a shift
    1553             :     // so that subwindow extracts give the exact same results as entire raster
    1554             :     // scaling.
    1555         892 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1556         892 :     bool bHasXOffVirtual = false;
    1557         892 :     int nDestXOffVirtual = 0;
    1558         892 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1559             :     {
    1560         767 :         bHasXOffVirtual = true;
    1561         767 :         dfXOff = nXOff;
    1562         767 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1563             :     }
    1564             : 
    1565         892 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1566         892 :     bool bHasYOffVirtual = false;
    1567         892 :     int nDestYOffVirtual = 0;
    1568         892 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1569             :     {
    1570         727 :         bHasYOffVirtual = true;
    1571         727 :         dfYOff = nYOff;
    1572         727 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1573             :     }
    1574             : 
    1575             :     // Create a MEM dataset that wraps the output buffer.
    1576         892 :     std::unique_ptr<void, VSIFreeReleaser> pTempBuffer;
    1577         892 :     GSpacing nPSMem = nPixelSpace;
    1578         892 :     GSpacing nLSMem = nLineSpace;
    1579         892 :     GSpacing nBandSpaceMEM = nBandSpace;
    1580         892 :     void *pDataMem = pData;
    1581         892 :     GDALDataType eDTMem = eBufType;
    1582         892 :     GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
    1583         892 :     const GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
    1584         892 :     if (eBufType != eDataType && !GDAL_GET_OPERATE_IN_BUF_TYPE(*psExtraArg))
    1585             :     {
    1586           2 :         nPSMem = GDALGetDataTypeSizeBytes(eDataType);
    1587           2 :         nLSMem = nPSMem * nBufXSize;
    1588           2 :         nBandSpaceMEM = nLSMem * nBandCount;
    1589           2 :         pTempBuffer.reset(VSI_MALLOC3_VERBOSE(nBandCount, nBufYSize,
    1590             :                                               static_cast<size_t>(nLSMem)));
    1591           2 :         if (pTempBuffer == nullptr)
    1592           0 :             return CE_Failure;
    1593           2 :         pDataMem = pTempBuffer.get();
    1594           2 :         eDTMem = eDataType;
    1595             :     }
    1596             : 
    1597             :     auto poMEMDS = std::unique_ptr<GDALDataset>(
    1598         892 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1599        1784 :                            nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr));
    1600             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1601             :     std::vector<GDALRasterBand *> apoDstBands(nBandCount);
    1602             : #endif
    1603         892 :     int nNBITS = 0;
    1604        2896 :     for (int i = 0; i < nBandCount; i++)
    1605             :     {
    1606        2004 :         GByte *const pBandData = static_cast<GByte *>(pDataMem) -
    1607        2004 :                                  nPSMem * nDestXOffVirtual -
    1608        2004 :                                  nLSMem * nDestYOffVirtual + nBandSpaceMEM * i;
    1609        2004 :         auto poMEMBand = GDALRasterBand::FromHandle(MEMCreateRasterBandEx(
    1610             :             poMEMDS.get(), i + 1, pBandData, eDTMem, nPSMem, nLSMem, false));
    1611        2004 :         poMEMDS->SetBand(i + 1, poMEMBand);
    1612             : 
    1613        2004 :         GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
    1614             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1615             :         apoDstBands[i] = poMEMBand;
    1616             : #endif
    1617             :         const char *pszNBITS =
    1618        2004 :             poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1619        2004 :         if (pszNBITS)
    1620             :         {
    1621           0 :             nNBITS = atoi(pszNBITS);
    1622           0 :             poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
    1623           0 :                                                            "IMAGE_STRUCTURE");
    1624             :         }
    1625             :     }
    1626             : 
    1627         892 :     CPLErr eErr = CE_None;
    1628             : 
    1629             :     // TODO(schwehr): Why disabled?  Why not just delete?
    1630             :     // Looks like this code was initially added as disabled by copying
    1631             :     // from RasterIO here:
    1632             :     // https://trac.osgeo.org/gdal/changeset/29572
    1633             : #if 0
    1634             :     // Do the resampling.
    1635             :     if( bUseWarp )
    1636             :     {
    1637             :         VRTDatasetH hVRTDS = nullptr;
    1638             :         GDALRasterBandH hVRTBand = nullptr;
    1639             :         if( GetDataset() == nullptr )
    1640             :         {
    1641             :             /* Create VRT dataset that wraps the whole dataset */
    1642             :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1643             :             VRTAddBand( hVRTDS, eDataType, nullptr );
    1644             :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1645             :             VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
    1646             :                                 (GDALRasterBandH)this,
    1647             :                                 0, 0,
    1648             :                                 nRasterXSize, nRasterYSize,
    1649             :                                 0, 0,
    1650             :                                 nRasterXSize, nRasterYSize,
    1651             :                                 nullptr, VRT_NODATA_UNSET );
    1652             : 
    1653             :             /* Add a mask band if needed */
    1654             :             if( GetMaskFlags() != GMF_ALL_VALID )
    1655             :             {
    1656             :                 ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
    1657             :                 VRTSourcedRasterBand* poVRTMaskBand =
    1658             :                     (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
    1659             :                 poVRTMaskBand->
    1660             :                     AddMaskBandSource( this,
    1661             :                                     0, 0,
    1662             :                                     nRasterXSize, nRasterYSize,
    1663             :                                     0, 0,
    1664             :                                     nRasterXSize, nRasterYSize);
    1665             :             }
    1666             :         }
    1667             : 
    1668             :         GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
    1669             :         psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
    1670             :         psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
    1671             :         psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
    1672             :         psWarpOptions->nBandCount = 1;
    1673             :         int nSrcBandNumber = (hVRTDS ? 1 : nBand);
    1674             :         int nDstBandNumber = 1;
    1675             :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1676             :         psWarpOptions->panDstBands = &nDstBandNumber;
    1677             :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
    1678             :                     psExtraArg->pfnProgress : GDALDummyProgress;
    1679             :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1680             :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1681             :         GDALRasterIOTransformerStruct sTransformer;
    1682             :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1683             :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1684             :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1685             :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1686             :         psWarpOptions->pTransformerArg = &sTransformer;
    1687             : 
    1688             :         GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
    1689             :         eErr = GDALChunkAndWarpImage( hWarpOperation,
    1690             :                                       nDestXOffVirtual, nDestYOffVirtual,
    1691             :                                       nBufXSize, nBufYSize );
    1692             :         GDALDestroyWarpOperation( hWarpOperation );
    1693             : 
    1694             :         psWarpOptions->panSrcBands = nullptr;
    1695             :         psWarpOptions->panDstBands = nullptr;
    1696             :         GDALDestroyWarpOptions( psWarpOptions );
    1697             : 
    1698             :         if( hVRTDS )
    1699             :             GDALClose(hVRTDS);
    1700             :     }
    1701             :     else
    1702             : #endif
    1703             :     {
    1704             :         const char *pszResampling =
    1705         892 :             GDALRasterIOGetResampleAlg(psExtraArg->eResampleAlg);
    1706             : 
    1707             :         int nBlockXSize, nBlockYSize;
    1708         892 :         poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    1709             : 
    1710             :         int nKernelRadius;
    1711             :         GDALResampleFunction pfnResampleFunc =
    1712         892 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1713         892 :         CPLAssert(pfnResampleFunc);
    1714             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1715             :         GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
    1716             :             GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
    1717             : #endif
    1718             :         GDALDataType eWrkDataType =
    1719         892 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1720             : 
    1721         892 :         int nDstBlockXSize = nBufXSize;
    1722         892 :         int nDstBlockYSize = nBufYSize;
    1723             :         int nFullResXChunk, nFullResYChunk;
    1724             :         while (true)
    1725             :         {
    1726         892 :             nFullResXChunk =
    1727         892 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1728         892 :             nFullResYChunk =
    1729         892 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1730         892 :             if (nFullResXChunk > nRasterXSize)
    1731         591 :                 nFullResXChunk = nRasterXSize;
    1732         892 :             if (nFullResYChunk > nRasterYSize)
    1733          57 :                 nFullResYChunk = nRasterYSize;
    1734         892 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1735         890 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1736             :                  1024 * 1024))
    1737             :                 break;
    1738             :             // When operating on the full width of a raster whose block width is
    1739             :             // the raster width, prefer doing chunks in height.
    1740           0 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1741             :                 nDstBlockYSize > 1)
    1742           0 :                 nDstBlockYSize /= 2;
    1743             :             /* Otherwise cut the maximal dimension */
    1744           0 :             else if (nDstBlockXSize > 1 &&
    1745           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1746           0 :                 nDstBlockXSize /= 2;
    1747             :             else
    1748           0 :                 nDstBlockYSize /= 2;
    1749             :         }
    1750             : 
    1751        1784 :         int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    1752         892 :                                   static_cast<int>(0.5 + dfYRatioDstToSrc));
    1753         892 :         if (nOvrFactor == 0)
    1754         104 :             nOvrFactor = 1;
    1755         892 :         int nFullResXSizeQueried =
    1756         892 :             nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    1757         892 :         int nFullResYSizeQueried =
    1758         892 :             nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
    1759             : 
    1760         892 :         if (nFullResXSizeQueried > nRasterXSize)
    1761         616 :             nFullResXSizeQueried = nRasterXSize;
    1762         892 :         if (nFullResYSizeQueried > nRasterYSize)
    1763          60 :             nFullResYSizeQueried = nRasterYSize;
    1764             : 
    1765         892 :         void *pChunk = VSI_MALLOC3_VERBOSE(
    1766             :             cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
    1767             :                               nBandCount),
    1768             :             nFullResXSizeQueried, nFullResYSizeQueried);
    1769         892 :         GByte *pabyChunkNoDataMask = nullptr;
    1770             : 
    1771         892 :         GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
    1772         892 :         int nMaskFlags = poFirstSrcBand->GetMaskFlags();
    1773             : 
    1774         892 :         bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
    1775         892 :         if (bUseNoDataMask)
    1776             :         {
    1777         617 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1778             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1779             :         }
    1780         892 :         if (pChunk == nullptr ||
    1781         617 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1782             :         {
    1783           0 :             CPLFree(pChunk);
    1784           0 :             CPLFree(pabyChunkNoDataMask);
    1785           0 :             return CE_Failure;
    1786             :         }
    1787             : 
    1788         892 :         const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
    1789         892 :                                  DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
    1790         892 :         int nBlocksDone = 0;
    1791             : 
    1792             :         int nDstYOff;
    1793        1784 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1794         892 :              nDstYOff += nDstBlockYSize)
    1795             :         {
    1796             :             int nDstYCount;
    1797         892 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1798         892 :                 nDstYCount = nDstBlockYSize;
    1799             :             else
    1800           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1801             : 
    1802         892 :             int nChunkYOff =
    1803         892 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1804         892 :             int nChunkYOff2 = nYOff + 1 +
    1805         892 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1806             :                                                     dfYRatioDstToSrc));
    1807         892 :             if (nChunkYOff2 > nRasterYSize)
    1808         139 :                 nChunkYOff2 = nRasterYSize;
    1809         892 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1810         892 :             CPLAssert(nYCount <= nFullResYChunk);
    1811             : 
    1812         892 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    1813         892 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
    1814         892 :             if (nChunkYOffQueried < 0)
    1815             :             {
    1816         142 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1817         142 :                 nChunkYOffQueried = 0;
    1818             :             }
    1819         892 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1820         157 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1821         892 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1822             : 
    1823             :             int nDstXOff;
    1824        1784 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1825         892 :                  nDstXOff += nDstBlockXSize)
    1826             :             {
    1827             :                 int nDstXCount;
    1828         892 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1829         892 :                     nDstXCount = nDstBlockXSize;
    1830             :                 else
    1831           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1832             : 
    1833         892 :                 int nChunkXOff =
    1834         892 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1835         892 :                 int nChunkXOff2 =
    1836         892 :                     nXOff + 1 +
    1837         892 :                     static_cast<int>(
    1838         892 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1839         892 :                 if (nChunkXOff2 > nRasterXSize)
    1840         647 :                     nChunkXOff2 = nRasterXSize;
    1841         892 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1842         892 :                 CPLAssert(nXCount <= nFullResXChunk);
    1843             : 
    1844         892 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    1845         892 :                 int nChunkXSizeQueried =
    1846         892 :                     nXCount + 2 * nKernelRadius * nOvrFactor;
    1847         892 :                 if (nChunkXOffQueried < 0)
    1848             :                 {
    1849         647 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1850         647 :                     nChunkXOffQueried = 0;
    1851             :                 }
    1852         892 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1853         655 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1854         892 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1855             : 
    1856         892 :                 bool bSkipResample = false;
    1857         892 :                 bool bNoDataMaskFullyOpaque = false;
    1858         892 :                 if (eErr == CE_None && bUseNoDataMask)
    1859             :                 {
    1860         617 :                     eErr = poMaskBand->RasterIO(
    1861             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1862             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1863             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1864             :                         nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
    1865             : 
    1866             :                     /* Optimizations if mask is fully opaque or transparent */
    1867         617 :                     const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1868         617 :                     const GByte bVal = pabyChunkNoDataMask[0];
    1869         617 :                     int i = 1;  // Used after for.
    1870    48197000 :                     for (; i < nPixels; i++)
    1871             :                     {
    1872    48196500 :                         if (pabyChunkNoDataMask[i] != bVal)
    1873          72 :                             break;
    1874             :                     }
    1875         617 :                     if (i == nPixels)
    1876             :                     {
    1877         545 :                         if (bVal == 0)
    1878             :                         {
    1879         373 :                             GByte abyZero[16] = {0};
    1880         780 :                             for (int iBand = 0; iBand < nBandCount; iBand++)
    1881             :                             {
    1882        3499 :                                 for (int j = 0; j < nDstYCount; j++)
    1883             :                                 {
    1884        3092 :                                     GDALCopyWords64(
    1885             :                                         abyZero, GDT_UInt8, 0,
    1886             :                                         static_cast<GByte *>(pDataMem) +
    1887        3092 :                                             iBand * nBandSpaceMEM +
    1888        3092 :                                             nLSMem * (j + nDstYOff) +
    1889        3092 :                                             nDstXOff * nPSMem,
    1890             :                                         eBufType, static_cast<int>(nPSMem),
    1891             :                                         nDstXCount);
    1892             :                                 }
    1893             :                             }
    1894         373 :                             bSkipResample = true;
    1895             :                         }
    1896             :                         else
    1897             :                         {
    1898         172 :                             bNoDataMaskFullyOpaque = true;
    1899             :                         }
    1900             :                     }
    1901             :                 }
    1902             : 
    1903         892 :                 if (!bSkipResample && eErr == CE_None)
    1904             :                 {
    1905             :                     /* Read the source buffers */
    1906         516 :                     eErr = RasterIO(
    1907             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1908             :                         nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1909             :                         nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
    1910             :                         nBandCount, panBandMap, 0, 0, 0, nullptr);
    1911             :                 }
    1912             : 
    1913             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1914             :                 if (pfnResampleFuncMultiBands && !bSkipResample &&
    1915             :                     eErr == CE_None)
    1916             :                 {
    1917             :                     eErr = pfnResampleFuncMultiBands(
    1918             :                         dfXRatioDstToSrc, dfYRatioDstToSrc,
    1919             :                         dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
    1920             :                         dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
    1921             :                         eWrkDataType, (GByte *)pChunk, nBandCount,
    1922             :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
    1923             :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
    1924             :                         nChunkXSizeQueried,
    1925             :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
    1926             :                         nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
    1927             :                         nDstXOff + nDestXOffVirtual + nDstXCount,
    1928             :                         nDstYOff + nDestYOffVirtual,
    1929             :                         nDstYOff + nDestYOffVirtual + nDstYCount,
    1930             :                         apoDstBands.data(), pszResampling, FALSE /*bHasNoData*/,
    1931             :                         0.0 /* dfNoDataValue */, nullptr /* color table*/,
    1932             :                         eDataType);
    1933             :                 }
    1934             :                 else
    1935             : #endif
    1936             :                 {
    1937             :                     size_t nChunkBandOffset =
    1938         892 :                         static_cast<size_t>(nChunkXSizeQueried) *
    1939         892 :                         nChunkYSizeQueried *
    1940         892 :                         GDALGetDataTypeSizeBytes(eWrkDataType);
    1941        2480 :                     for (int i = 0;
    1942        2480 :                          i < nBandCount && !bSkipResample && eErr == CE_None;
    1943             :                          i++)
    1944             :                     {
    1945        1588 :                         const bool bPropagateNoData = false;
    1946        1588 :                         void *pDstBuffer = nullptr;
    1947        1588 :                         GDALDataType eDstBufferDataType = GDT_Unknown;
    1948             :                         GDALRasterBand *poMEMBand =
    1949        1588 :                             poMEMDS->GetRasterBand(i + 1);
    1950        1588 :                         GDALOverviewResampleArgs args;
    1951        1588 :                         args.eSrcDataType = eDataType;
    1952        1588 :                         args.eOvrDataType = poMEMBand->GetRasterDataType();
    1953        1588 :                         args.nOvrXSize = poMEMBand->GetXSize();
    1954        1588 :                         args.nOvrYSize = poMEMBand->GetYSize();
    1955        1588 :                         args.nOvrNBITS = nNBITS;
    1956        1588 :                         args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1957        1588 :                         args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1958        1588 :                         args.dfSrcXDelta =
    1959        1588 :                             dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1960        1588 :                         args.dfSrcYDelta =
    1961        1588 :                             dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1962        1588 :                         args.eWrkDataType = eWrkDataType;
    1963        1588 :                         args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
    1964        1588 :                                                        ? nullptr
    1965             :                                                        : pabyChunkNoDataMask;
    1966        1588 :                         args.nChunkXOff =
    1967        1588 :                             nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1968        1588 :                         args.nChunkXSize = nChunkXSizeQueried;
    1969        1588 :                         args.nChunkYOff =
    1970        1588 :                             nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1971        1588 :                         args.nChunkYSize = nChunkYSizeQueried;
    1972        1588 :                         args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1973        1588 :                         args.nDstXOff2 =
    1974        1588 :                             nDstXOff + nDestXOffVirtual + nDstXCount;
    1975        1588 :                         args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1976        1588 :                         args.nDstYOff2 =
    1977        1588 :                             nDstYOff + nDestYOffVirtual + nDstYCount;
    1978        1588 :                         args.pszResampling = pszResampling;
    1979        1588 :                         args.bHasNoData = false;
    1980        1588 :                         args.dfNoDataValue = 0.0;
    1981        1588 :                         args.poColorTable = nullptr;
    1982        1588 :                         args.bPropagateNoData = bPropagateNoData;
    1983             : 
    1984             :                         eErr =
    1985        3176 :                             pfnResampleFunc(args,
    1986        1588 :                                             reinterpret_cast<GByte *>(pChunk) +
    1987        1588 :                                                 i * nChunkBandOffset,
    1988             :                                             &pDstBuffer, &eDstBufferDataType);
    1989        1588 :                         if (eErr == CE_None)
    1990             :                         {
    1991        1588 :                             eErr = poMEMBand->RasterIO(
    1992             :                                 GF_Write, nDstXOff + nDestXOffVirtual,
    1993             :                                 nDstYOff + nDestYOffVirtual, nDstXCount,
    1994             :                                 nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
    1995             :                                 eDstBufferDataType, 0, 0, nullptr);
    1996             :                         }
    1997        1588 :                         CPLFree(pDstBuffer);
    1998             :                     }
    1999             :                 }
    2000             : 
    2001         892 :                 nBlocksDone++;
    2002        1281 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    2003         389 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    2004             :                                              "", psExtraArg->pProgressData))
    2005             :                 {
    2006           0 :                     eErr = CE_Failure;
    2007             :                 }
    2008             :             }
    2009             :         }
    2010             : 
    2011         892 :         CPLFree(pChunk);
    2012         892 :         CPLFree(pabyChunkNoDataMask);
    2013             :     }
    2014             : 
    2015         892 :     if (pTempBuffer)
    2016             :     {
    2017           2 :         CPL_IGNORE_RET_VAL(poMEMDS->RasterIO(
    2018             :             GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
    2019             :             pData, nBufXSize, nBufYSize, eBufType, nBandCount, nullptr,
    2020             :             nPixelSpace, nLineSpace, nBandSpace, nullptr));
    2021             :     }
    2022             : 
    2023         892 :     return eErr;
    2024             : }
    2025             : 
    2026             : //! @endcond
    2027             : 
    2028             : /************************************************************************/
    2029             : /*                           GDALSwapWords()                            */
    2030             : /************************************************************************/
    2031             : 
    2032             : /**
    2033             :  * Byte swap words in-place.
    2034             :  *
    2035             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2036             :  * a memory array.  No assumption is made that the words being swapped are
    2037             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2038             :  * to determine if the current platform is big endian or little endian.  Use
    2039             :  * the macros like CPL_SWAP32() to byte swap single values without the overhead
    2040             :  * of a function call.
                     :  *
                     :  * A word size of 1 is accepted and leaves the buffer untouched.  For 4 and
                     :  * 8 byte words, a faster whole-word load/store path is taken when both the
                     :  * start address and nWordSkip are multiples of the word size; otherwise
                     :  * the pointer-based swap macros are used.  Any other word size hits
                     :  * CPLAssert(false).
    2041             :  *
    2042             :  * @param pData pointer to start of data buffer.
    2043             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2044             :  * @param nWordCount the number of words to be swapped in this call.
    2045             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2046             :  * the next. For packed buffers this is the same as nWordSize.
    2047             :  */
    2048             : 
    2049      497149 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
    2050             :                                int nWordSkip)
    2051             : 
    2052             : {
    2053      497149 :     if (nWordCount > 0)  // a null buffer is only checked when there is work
    2054      497149 :         VALIDATE_POINTER0(pData, "GDALSwapWords");  // NOTE(review): presumably reports and returns on null - confirm
    2055             : 
    2056      497149 :     GByte *pabyData = static_cast<GByte *>(pData);
    2057             : 
    2058      497149 :     switch (nWordSize)
    2059             :     {
    2060        7234 :         case 1:  // single bytes have no byte order: nothing to do
    2061        7234 :             break;
    2062             : 
    2063      476905 :         case 2:
    2064      476905 :             CPLAssert(nWordSkip >= 2 || nWordCount == 1);  // consecutive words must not overlap
    2065   228062000 :             for (int i = 0; i < nWordCount; i++)
    2066             :             {
    2067   227585000 :                 CPL_SWAP16PTR(pabyData);
    2068   227585000 :                 pabyData += nWordSkip;  // advance by the byte stride, not by the word size
    2069             :             }
    2070      476905 :             break;
    2071             : 
    2072       10584 :         case 4:
    2073       10584 :             CPLAssert(nWordSkip >= 4 || nWordCount == 1);
    2074       10584 :             if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)  // every word stays 4-byte aligned
    2075             :             {
    2076    29140600 :                 for (int i = 0; i < nWordCount; i++)
    2077             :                 {
    2078    29130000 :                     *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
    2079             :                         *reinterpret_cast<const GUInt32 *>(pabyData));
    2080    29130000 :                     pabyData += nWordSkip;
    2081       10581 :                 }
    2082             :             }
    2083             :             else  // start address or stride not a multiple of 4
    2084             :             {
    2085           9 :                 for (int i = 0; i < nWordCount; i++)
    2086             :                 {
    2087           6 :                     CPL_SWAP32PTR(pabyData);
    2088           6 :                     pabyData += nWordSkip;
    2089             :                 }
    2090             :             }
    2091       10584 :             break;
    2092             : 
    2093        2426 :         case 8:
    2094        2426 :             CPLAssert(nWordSkip >= 8 || nWordCount == 1);
    2095        2426 :             if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)  // every word stays 8-byte aligned
    2096             :             {
    2097     3356900 :                 for (int i = 0; i < nWordCount; i++)
    2098             :                 {
    2099     3354480 :                     *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
    2100             :                         *reinterpret_cast<const GUInt64 *>(pabyData));
    2101     3354480 :                     pabyData += nWordSkip;
    2102        2425 :                 }
    2103             :             }
    2104             :             else  // start address or stride not a multiple of 8
    2105             :             {
    2106           3 :                 for (int i = 0; i < nWordCount; i++)
    2107             :                 {
    2108           2 :                     CPL_SWAP64PTR(pabyData);
    2109           2 :                     pabyData += nWordSkip;
    2110             :                 }
    2111             :             }
    2112        2426 :             break;
    2113             : 
    2114           0 :         default:  // word sizes other than 1, 2, 4 or 8 are not supported
    2115           0 :             CPLAssert(false);
    2116             :     }
    2117             : }
    2118             : 
    2119             : /************************************************************************/
    2120             : /*                          GDALSwapWordsEx()                           */
    2121             : /************************************************************************/
    2122             : 
    2123             : /**
    2124             :  * Byte swap words in-place.
    2125             :  *
    2126             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2127             :  * a memory array.  No assumption is made that the words being swapped are
    2128             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2129             :  * to determine if the current platform is big endian or little endian.  Use
    2130             :  * the macros like CPL_SWAP32() to byte swap single values without the overhead
    2131             :  * of a function call.
                     :  *
                     :  * This variant takes the word count as a size_t.  The work is forwarded to
                     :  * GDALSwapWords() in chunks of at most 2^30 words, so that each chunk
                     :  * count fits in the int parameter of that function.
    2132             :  *
    2133             :  * @param pData pointer to start of data buffer.
    2134             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2135             :  * @param nWordCount the number of words to be swapped in this call.
    2136             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2137             :  * the next. For packed buffers this is the same as nWordSize.
    2138             :  */
    2139        6130 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
    2140             :                                  int nWordSkip)
    2141             : {
    2142        6130 :     GByte *pabyData = static_cast<GByte *>(pData);
    2143       12260 :     while (nWordCount)  // loop until every word has been handed to GDALSwapWords()
    2144             :     {
    2145             :         // Pick-up a multiple of 8 as max chunk size.
    2146        6130 :         const int nWordCountSmall =
    2147        6130 :             (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
    2148        6130 :         GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
    2149        6130 :         pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;  // byte advance computed in size_t
    2150        6130 :         nWordCount -= nWordCountSmall;
    2151             :     }
    2152        6130 : }
    2153             : 
    2154             : // Place the new GDALCopyWords helpers in an anonymous namespace
    2155             : namespace
    2156             : {
    2157             : 
    2158             : /************************************************************************/
    2159             : /*                           GDALCopyWordsT()                           */
    2160             : /************************************************************************/
    2161             : /**
    2162             :  * Template function, used to copy data from pSrcData into buffer
    2163             :  * pDstData, with stride nSrcPixelStride in the source data and
    2164             :  * stride nDstPixelStride in the destination data. This template can
    2165             :  * deal with the case where the input data type is real or complex and
    2166             :  * the output is real.
                     :  *
                     :  * Both strides are byte offsets: they are applied to char pointers, not
                     :  * to Tin / Tout pointers. Each value is converted by GDALCopyWord().
    2167             :  *
    2168             :  * @param pSrcData the source data buffer
    2169             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2170             :  *                      of interest.
    2171             :  * @param pDstData the destination buffer.
    2172             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2173             :  *                      interest.
    2174             :  * @param nWordCount the total number of pixel words to copy
    2175             :  *
    2176             :  * @code
    2177             :  * // Assume an input buffer of type GUInt16 named pBufferIn
    2178             :  * GByte *pBufferOut = new GByte[numBytesOut];
    2179             :  * GDALCopyWordsT<GUInt16, GByte>(pBufferIn, 2, pBufferOut, 1, numBytesOut);
    2180             :  * @endcode
    2181             :  * @note
    2182             :  * This is a private function, and should not be exposed outside of
    2183             :  * rasterio.cpp. External users should call the GDALCopyWords driver function.
    2184             :  */
    2185             : 
    2186             : template <class Tin, class Tout>
    2187    49013857 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
    2188             :                                          int nSrcPixelStride,
    2189             :                                          Tout *const CPL_RESTRICT pDstData,
    2190             :                                          int nDstPixelStride,
    2191             :                                          GPtrDiff_t nWordCount)
    2192             : {
    2193    49013857 :     decltype(nWordCount) nDstOffset = 0;  // running byte offset into the destination
    2194             : 
    2195    49013857 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2196    49013857 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2197   356655113 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2198             :     {
    2199   307641208 :         const Tin tValue =
    2200   307641208 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));  // source offset recomputed from n
    2201   307641208 :         Tout *const pOutPixel =
    2202   307641208 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2203             : 
    2204   307641208 :         GDALCopyWord(tValue, *pOutPixel);  // per-value Tin -> Tout conversion
    2205             : 
    2206   307641208 :         nDstOffset += nDstPixelStride;
    2207             :     }
    2208    49013857 : }
    2209             : 
                       // Primary GDALCopyWordsT template: a CPL_NOINLINE entry point that only
                       // forwards to GDALCopyWordsGenericT(). Explicit specializations for some
                       // type pairs follow below.
    2210             : template <class Tin, class Tout>
    2211    29776660 : static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
    2212             :                                         int nSrcPixelStride,
    2213             :                                         Tout *const CPL_RESTRICT pDstData,
    2214             :                                         int nDstPixelStride,
    2215             :                                         GPtrDiff_t nWordCount)
    2216             : {
    2217    29776660 :     GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
    2218             :                           nWordCount);
    2219    29776660 : }
    2220             : 
                       // Copy nWordCount values from pSrcData to pDstData with conversion. When
                       // both buffers are packed (byte stride equal to the element size), values
                       // are converted eight at a time with GDALCopy8Words(); whatever remains,
                       // or everything when a buffer is not packed, goes through GDALCopyWord()
                       // one value at a time. Strides are byte offsets.
    2221             : template <class Tin, class Tout>
    2222     5080936 : static void inline GDALCopyWordsT_8atatime(
    2223             :     const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
    2224             :     Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
    2225             :     GPtrDiff_t nWordCount)
    2226             : {
    2227     5080936 :     decltype(nWordCount) nDstOffset = 0;  // running byte offset, shared by both loops
    2228             : 
    2229     5080936 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2230     5080936 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2231     5080936 :     decltype(nWordCount) n = 0;  // declared outside the loops so the tail loop resumes here
    2232     5080936 :     if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
    2233             :         nDstPixelStride == static_cast<int>(sizeof(Tout)))
    2234             :     {
    2235    52932327 :         for (; n < nWordCount - 7; n += 8)  // NOTE(review): relies on GPtrDiff_t being signed for counts < 7 - confirm
    2236             :         {
    2237    52390796 :             const Tin *pInValues = reinterpret_cast<const Tin *>(
    2238    52390796 :                 pSrcDataPtr + (n * nSrcPixelStride));
    2239    52390796 :             Tout *const pOutPixels =
    2240    52390796 :                 reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2241             : 
    2242    52390796 :             GDALCopy8Words(pInValues, pOutPixels);
    2243             : 
    2244    52390796 :             nDstOffset += 8 * nDstPixelStride;
    2245             :         }
    2246             :     }
    2247    10465999 :     for (; n < nWordCount; n++)  // remaining values, one at a time
    2248             :     {
    2249     5385053 :         const Tin tValue =
    2250     5385053 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2251     5385053 :         Tout *const pOutPixel =
    2252     5385053 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2253             : 
    2254     5385053 :         GDALCopyWord(tValue, *pOutPixel);
    2255             : 
    2256     5385053 :         nDstOffset += nDstPixelStride;
    2257             :     }
    2258     5080936 : }
    2259             : 
    2260             : #ifdef HAVE_SSE2
    2261             : 
                       // Widen bytes to a 16-bit integer type (Tout is checked by static_assert).
                       // Packed buffers use SSE2: 16 source bytes are loaded at once, interleaved
                       // with zero bytes to form sixteen 16-bit values, and written with two
                       // unaligned 16-byte stores. Other strides use GDALCopyWordsGenericT().
    2262             : template <class Tout>
    2263     1042126 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
    2264             :                               int nSrcPixelStride,
    2265             :                               Tout *const CPL_RESTRICT pDstData,
    2266             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2267             : {
    2268             :     static_assert(std::is_integral<Tout>::value &&
    2269             :                       sizeof(Tout) == sizeof(uint16_t),
    2270             :                   "Bad Tout");
    2271     1042126 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2272             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2273             :     {
    2274       35752 :         decltype(nWordCount) n = 0;
    2275       35752 :         const __m128i xmm_zero = _mm_setzero_si128();
    2276       35752 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2277             :             reinterpret_cast<GByte *>(pDstData);
    2278     1478148 :         for (; n < nWordCount - 15; n += 16)  // 16 source bytes per iteration
    2279             :         {
    2280     1442396 :             __m128i xmm = _mm_loadu_si128(
    2281     1442396 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2282     1442396 :             __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);  // bytes 0..7 as 16-bit lanes
    2283     1442396 :             __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);  // bytes 8..15 as 16-bit lanes
    2284             :             _mm_storeu_si128(
    2285     1442396 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
    2286             :             _mm_storeu_si128(
    2287     1442396 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
    2288             :         }
    2289      111662 :         for (; n < nWordCount; n++)  // fewer than 16 values left
    2290             :         {
    2291       75910 :             pDstData[n] = pSrcData[n];
    2292       35752 :         }
    2293             :     }
    2294             :     else
    2295             :     {
    2296     1006371 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2297             :                               nDstPixelStride, nWordCount);
    2298             :     }
    2299     1042126 : }
    2300             : 
                       // GByte -> GUInt16 specialization: forwards to GDALCopyWordsByteTo16Bit().
    2301             : template <>
    2302     1029400 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2303             :                                  int nSrcPixelStride,
    2304             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    2305             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2306             : {
    2307     1029400 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2308             :                              nDstPixelStride, nWordCount);
    2309     1029400 : }
    2310             : 
                       // GByte -> GInt16 specialization: forwards to GDALCopyWordsByteTo16Bit().
    2311             : template <>
    2312       12726 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2313             :                                  int nSrcPixelStride,
    2314             :                                  GInt16 *const CPL_RESTRICT pDstData,
    2315             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2316             : {
    2317       12726 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2318             :                              nDstPixelStride, nWordCount);
    2319       12726 : }
    2320             : 
                       // Widen bytes to a 32-bit integer type (Tout is checked by static_assert).
                       // Packed buffers use SSE2: 16 source bytes are interleaved with zeros in
                       // two stages (8 -> 16 -> 32 bits) and written with four unaligned 16-byte
                       // stores. Other strides use GDALCopyWordsGenericT().
    2321             : template <class Tout>
    2322    16237176 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
    2323             :                               int nSrcPixelStride,
    2324             :                               Tout *const CPL_RESTRICT pDstData,
    2325             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2326             : {
    2327             :     static_assert(std::is_integral<Tout>::value &&
    2328             :                       sizeof(Tout) == sizeof(uint32_t),
    2329             :                   "Bad Tout");
    2330    16237176 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2331             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2332             :     {
    2333     6532686 :         decltype(nWordCount) n = 0;
    2334     6532686 :         const __m128i xmm_zero = _mm_setzero_si128();
    2335     6532686 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2336             :             reinterpret_cast<GByte *>(pDstData);
    2337    74248227 :         for (; n < nWordCount - 15; n += 16)  // 16 source bytes per iteration
    2338             :         {
    2339    67715461 :             __m128i xmm = _mm_loadu_si128(
    2340    67715461 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2341    67715461 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);  // bytes 0..7 as 16-bit lanes
    2342    67715461 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);  // bytes 8..15 as 16-bit lanes
    2343    67715461 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);  // values 0..3 as 32-bit lanes
    2344    67715461 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);  // values 4..7
    2345    67715461 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);  // values 8..11
    2346    67715461 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);  // values 12..15
    2347             :             _mm_storeu_si128(
    2348    67715461 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
    2349             :             _mm_storeu_si128(
    2350    67715461 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
    2351             :             _mm_storeu_si128(
    2352    67715461 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
    2353             :             _mm_storeu_si128(
    2354    67715461 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
    2355             :         }
    2356    14826316 :         for (; n < nWordCount; n++)  // fewer than 16 values left
    2357             :         {
    2358     8293640 :             pDstData[n] = pSrcData[n];
    2359     6532686 :         }
    2360             :     }
    2361             :     else
    2362             :     {
    2363     9704510 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2364             :                               nDstPixelStride, nWordCount);
    2365             :     }
    2366    16237176 : }
    2367             : 
                       // GByte -> GUInt32 specialization: forwards to GDALCopyWordsByteTo32Bit().
    2368             : template <>
    2369         476 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2370             :                                  int nSrcPixelStride,
    2371             :                                  GUInt32 *const CPL_RESTRICT pDstData,
    2372             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2373             : {
    2374         476 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2375             :                              nDstPixelStride, nWordCount);
    2376         476 : }
    2377             : 
                       // GByte -> GInt32 specialization: forwards to GDALCopyWordsByteTo32Bit().
    2378             : template <>
    2379    16236700 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2380             :                                  int nSrcPixelStride,
    2381             :                                  GInt32 *const CPL_RESTRICT pDstData,
    2382             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2383             : {
    2384    16236700 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2385             :                              nDstPixelStride, nWordCount);
    2386    16236700 : }
    2387             : 
                       // GByte -> float specialization. Packed buffers use SSE2: 16 source bytes
                       // are widened to 32-bit integers by interleaving with zeros, converted to
                       // float with _mm_cvtepi32_ps and written with four unaligned stores. Other
                       // strides use GDALCopyWordsGenericT().
    2388             : template <>
    2389     2851070 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2390             :                                  int nSrcPixelStride,
    2391             :                                  float *const CPL_RESTRICT pDstData,
    2392             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2393             : {
    2394     2851070 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2395             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2396             :     {
    2397      228189 :         decltype(nWordCount) n = 0;
    2398      228189 :         const __m128i xmm_zero = _mm_setzero_si128();
    2399      228189 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2400             :             reinterpret_cast<GByte *>(pDstData);
    2401     2267160 :         for (; n < nWordCount - 15; n += 16)  // 16 source bytes per iteration
    2402             :         {
    2403     2038970 :             __m128i xmm = _mm_loadu_si128(
    2404     2038970 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2405     2038970 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);  // bytes 0..7 as 16-bit lanes
    2406     2038970 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);  // bytes 8..15 as 16-bit lanes
    2407     2038970 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);  // values 0..3 as 32-bit lanes
    2408     2038970 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);  // values 4..7
    2409     2038970 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);  // values 8..11
    2410     2038970 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);  // values 12..15
    2411     2038970 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);  // int32 -> float, four lanes at a time
    2412     2038970 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2413     2038970 :             __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
    2414     2038970 :             __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
    2415     2038970 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2416             :                           xmm0_f);
    2417             :             _mm_storeu_ps(
    2418     2038970 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2419             :             _mm_storeu_ps(
    2420     2038970 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
    2421             :             _mm_storeu_ps(
    2422     2038970 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
    2423             :         }
    2424      951437 :         for (; n < nWordCount; n++)  // fewer than 16 values left
    2425             :         {
    2426      723248 :             pDstData[n] = pSrcData[n];
    2427      228189 :         }
    2428             :     }
    2429             :     else
    2430             :     {
    2431     2622880 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2432             :                               nDstPixelStride, nWordCount);
    2433             :     }
    2434     2851070 : }
    2435             : 
    2436             : template <>
    2437      170938 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2438             :                                  int nSrcPixelStride,
    2439             :                                  double *const CPL_RESTRICT pDstData,
    2440             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2441             : {
    2442      170938 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2443             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2444             :     {
    2445      147140 :         decltype(nWordCount) n = 0;
    2446      147140 :         const __m128i xmm_zero = _mm_setzero_si128();
    2447      147140 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2448             :             reinterpret_cast<GByte *>(pDstData);
    2449     3127410 :         for (; n < nWordCount - 15; n += 16)
    2450             :         {
    2451     2980270 :             __m128i xmm = _mm_loadu_si128(
    2452     2980270 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2453     2980270 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2454     2980270 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2455     2980270 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2456     2980270 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2457     2980270 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2458     2980270 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2459             : 
    2460             : #if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
    2461             :             _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2462             :                              _mm256_cvtepi32_pd(xmm0));
    2463             :             _mm256_storeu_pd(
    2464             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2465             :                 _mm256_cvtepi32_pd(xmm1));
    2466             :             _mm256_storeu_pd(
    2467             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2468             :                 _mm256_cvtepi32_pd(xmm2));
    2469             :             _mm256_storeu_pd(
    2470             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2471             :                 _mm256_cvtepi32_pd(xmm3));
    2472             : #else
    2473     2980270 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2474     2980270 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2475     2980270 :             __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
    2476     2980270 :             __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
    2477     2980270 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2478     2980270 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2479     2980270 :             xmm2 = _mm_srli_si128(xmm2, 8);
    2480     2980270 :             xmm3 = _mm_srli_si128(xmm3, 8);
    2481     2980270 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2482     2980270 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2483     2980270 :             __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
    2484     2980270 :             __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
    2485             : 
    2486     2980270 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2487             :                           xmm0_low_d);
    2488             :             _mm_storeu_pd(
    2489     2980270 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2490             :                 xmm0_high_d);
    2491             :             _mm_storeu_pd(
    2492     2980270 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2493             :                 xmm1_low_d);
    2494             :             _mm_storeu_pd(
    2495     2980270 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2496             :                 xmm1_high_d);
    2497             :             _mm_storeu_pd(
    2498     2980270 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2499             :                 xmm2_low_d);
    2500             :             _mm_storeu_pd(
    2501     2980270 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
    2502             :                 xmm2_high_d);
    2503             :             _mm_storeu_pd(
    2504     2980270 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2505             :                 xmm3_low_d);
    2506             :             _mm_storeu_pd(
    2507     2980270 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
    2508             :                 xmm3_high_d);
    2509             : #endif
    2510             :         }
    2511      280823 :         for (; n < nWordCount; n++)
    2512             :         {
    2513      133683 :             pDstData[n] = pSrcData[n];
    2514      147140 :         }
    2515             :     }
    2516             :     else
    2517             :     {
    2518       23798 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2519             :                               nDstPixelStride, nWordCount);
    2520             :     }
    2521      170938 : }
    2522             : 
    // GDALCopyWordsT specialization: uint8_t source -> int8_t destination.
    // Source values of 127 or more are stored as 127; smaller values are
    // copied unchanged.
    // Packed case (both pixel strides equal to the 1-byte element size):
    // 32 values per iteration using two unaligned 128-bit loads, an unsigned
    // byte minimum against 127 and two unaligned stores, followed by a scalar
    // loop for the remaining values.
    // Any other stride combination is delegated to GDALCopyWordsGenericT.
    2523             : template <>
    2524         148 : CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
    2525             :                                  int nSrcPixelStride,
    2526             :                                  int8_t *const CPL_RESTRICT pDstData,
    2527             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2528             : {
    2529         148 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2530             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2531             :     {
    2532         142 :         decltype(nWordCount) n = 0;
    2533         142 :         const __m128i xmm_127 = _mm_set1_epi8(127);
                               // Runs only while a full group of 32 values remains.
    2534         146 :         for (; n < nWordCount - 31; n += 32)
    2535             :         {
    2536           8 :             __m128i xmm0 = _mm_loadu_si128(
    2537           4 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2538           4 :             __m128i xmm1 = _mm_loadu_si128(
    2539           4 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 16));
    2540           4 :             xmm0 = _mm_min_epu8(xmm0, xmm_127);
    2541           4 :             xmm1 = _mm_min_epu8(xmm1, xmm_127);
    2542           4 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2543           4 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
    2544             :                              xmm1);
    2545             :         }
                               // Scalar tail: same clamp, one value at a time.
    2546        2424 :         for (; n < nWordCount; n++)
    2547             :         {
    2548        2282 :             pDstData[n] =
    2549        2282 :                 pSrcData[n] >= 127 ? 127 : static_cast<int8_t>(pSrcData[n]);
    2550         142 :         }
    2551             :     }
    2552             :     else
    2553             :     {
    2554           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2555             :                               nDstPixelStride, nWordCount);
    2556             :     }
    2557         148 : }
    2558             : 
    // GDALCopyWordsT specialization: int8_t source -> uint8_t destination.
    // Negative source values are stored as 0; others are copied unchanged.
    // Packed case (both pixel strides equal to the 1-byte element size):
    // 32 values per iteration, then a scalar loop for the remainder.
    //  - With __SSE4_1__ or USE_NEON_OPTIMIZATIONS: signed byte maximum
    //    against zero.
    //  - Otherwise: each byte has -128 added (which flips its top bit), an
    //    unsigned byte maximum against the same constant is taken, and the
    //    constant is subtracted again; this reproduces a signed maximum
    //    against zero using only the unsigned byte maximum.
    // Any other stride combination is delegated to GDALCopyWordsGenericT.
    2559             : template <>
    2560          62 : CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
    2561             :                                  int nSrcPixelStride,
    2562             :                                  uint8_t *const CPL_RESTRICT pDstData,
    2563             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2564             : {
    2565          62 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2566             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2567             :     {
    2568          56 :         decltype(nWordCount) n = 0;
    2569             : #if !(defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS))
    2570          56 :         const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
    2571             : #endif
                               // Runs only while a full group of 32 values remains.
    2572         117 :         for (; n < nWordCount - 31; n += 32)
    2573             :         {
    2574         122 :             __m128i xmm0 = _mm_loadu_si128(
    2575          61 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2576          61 :             __m128i xmm1 = _mm_loadu_si128(
    2577          61 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 16));
    2578             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2579             :             xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
    2580             :             xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
    2581             : #else
    2582          61 :             xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
    2583          61 :             xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
    2584          61 :             xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
    2585          61 :             xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
    2586          61 :             xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
    2587          61 :             xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
    2588             : #endif
    2589          61 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2590          61 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
    2591             :                              xmm1);
    2592             :         }
                               // Scalar tail: same clamp, one value at a time.
    2593         352 :         for (; n < nWordCount; n++)
    2594             :         {
    2595         296 :             pDstData[n] =
    2596         296 :                 pSrcData[n] < 0 ? 0 : static_cast<uint8_t>(pSrcData[n]);
    2597          56 :         }
    2598             :     }
    2599             :     else
    2600             :     {
    2601           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2602             :                               nDstPixelStride, nWordCount);
    2603             :     }
    2604          62 : }
    2605             : 
    // GDALCopyWordsT specialization: uint16_t source -> uint8_t destination.
    // Source values of 255 or more are stored as 255; smaller values are
    // copied unchanged.
    // Packed case (source stride 2 bytes, destination stride 1 byte):
    // 16 values per iteration from two unaligned 128-bit loads, narrowed
    // into a single 128-bit store, then a scalar loop for the remainder.
    //  - With __SSE4_1__ or USE_NEON_OPTIMIZATIONS: unsigned 16-bit minimum
    //    against 32767, so every lane is non-negative when read as signed
    //    before the saturating pack to unsigned bytes.
    //  - Otherwise: lanes are biased by -32768, a signed minimum against
    //    (255 - 32768) is taken, and the bias is removed, leaving values
    //    already limited to 255 before the pack.
    // Any other stride combination is delegated to GDALCopyWordsGenericT.
    2606             : template <>
    2607        6037 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2608             :                                  int nSrcPixelStride,
    2609             :                                  uint8_t *const CPL_RESTRICT pDstData,
    2610             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2611             : {
    2612        6037 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2613             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2614             :     {
    2615        5062 :         decltype(nWordCount) n = 0;
    2616             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2617             :         const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
    2618             : #else
    2619             :         // In SSE2, min_epu16 does not exist, so shift from
    2620             :         // UInt16 to SInt16 to be able to use min_epi16
    2621        5062 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2622        5062 :         const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
    2623             : #endif
                               // Runs only while a full group of 16 values remains.
    2624       71888 :         for (; n < nWordCount - 15; n += 16)
    2625             :         {
    2626      133652 :             __m128i xmm0 = _mm_loadu_si128(
    2627       66826 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2628       66826 :             __m128i xmm1 = _mm_loadu_si128(
    2629       66826 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2630             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2631             :             xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
    2632             :             xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
    2633             : #else
    2634       66826 :             xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
    2635       66826 :             xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
    2636       66826 :             xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
    2637       66826 :             xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
    2638       66826 :             xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
    2639       66826 :             xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
    2640             : #endif
                                   // Narrow both 8-lane vectors into sixteen bytes.
    2641       66826 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    2642       66826 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2643             :         }
                               // Scalar tail: same clamp, one value at a time.
    2644       16403 :         for (; n < nWordCount; n++)
    2645             :         {
    2646       11341 :             pDstData[n] =
    2647       11341 :                 pSrcData[n] >= 255 ? 255 : static_cast<uint8_t>(pSrcData[n]);
    2648        5062 :         }
    2649             :     }
    2650             :     else
    2651             :     {
    2652         975 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2653             :                               nDstPixelStride, nWordCount);
    2654             :     }
    2655        6037 : }
    2656             : 
    // GDALCopyWordsT specialization: uint16_t source -> int16_t destination.
    // Source values of 32767 or more are stored as 32767; smaller values are
    // copied unchanged.
    // Packed case (both pixel strides equal to the 2-byte element size):
    // 16 values per iteration (two unaligned 128-bit loads and stores), then
    // a scalar loop for the remainder.
    //  - With __SSE4_1__ or USE_NEON_OPTIMIZATIONS: unsigned 16-bit minimum
    //    against 32767.
    //  - Otherwise: lanes are biased by -32768, a signed minimum against
    //    (32767 - 32768) is taken, and the bias is removed.
    // Any other stride combination is delegated to GDALCopyWordsGenericT.
    2657             : template <>
    2658          46 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2659             :                                  int nSrcPixelStride,
    2660             :                                  int16_t *const CPL_RESTRICT pDstData,
    2661             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2662             : {
    2663          46 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2664             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2665             :     {
    2666          40 :         decltype(nWordCount) n = 0;
    2667             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2668             :         const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
    2669             : #else
    2670             :         // In SSE2, min_epu16 does not exist, so shift from
    2671             :         // UInt16 to SInt16 to be able to use min_epi16
    2672          40 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2673          40 :         const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
    2674             : #endif
                               // Runs only while a full group of 16 values remains.
    2675         169 :         for (; n < nWordCount - 15; n += 16)
    2676             :         {
    2677         258 :             __m128i xmm0 = _mm_loadu_si128(
    2678         129 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2679         129 :             __m128i xmm1 = _mm_loadu_si128(
    2680         129 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2681             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2682             :             xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
    2683             :             xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
    2684             : #else
    2685         129 :             xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
    2686         129 :             xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
    2687         129 :             xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
    2688         129 :             xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
    2689         129 :             xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
    2690         129 :             xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
    2691             : #endif
    2692         129 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2693         129 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
    2694             :                              xmm1);
    2695             :         }
                               // Scalar tail: same clamp, one value at a time.
    2696         191 :         for (; n < nWordCount; n++)
    2697             :         {
    2698         282 :             pDstData[n] = pSrcData[n] >= 32767
    2699             :                               ? 32767
    2700         131 :                               : static_cast<int16_t>(pSrcData[n]);
    2701          40 :         }
    2702             :     }
    2703             :     else
    2704             :     {
    2705           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2706             :                               nDstPixelStride, nWordCount);
    2707             :     }
    2708          46 : }
    2709             : 
    // GDALCopyWordsT specialization: int16_t source -> uint16_t destination.
    // Negative source values are stored as 0; others are copied unchanged.
    // Packed case (both pixel strides equal to the 2-byte element size):
    // 16 values per iteration using two unaligned 128-bit loads, a signed
    // 16-bit maximum against zero and two unaligned stores, then a scalar
    // loop for the remainder.
    // Any other stride combination is delegated to GDALCopyWordsGenericT.
    2710             : template <>
    2711         136 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2712             :                                  int nSrcPixelStride,
    2713             :                                  uint16_t *const CPL_RESTRICT pDstData,
    2714             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2715             : {
    2716         136 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2717             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2718             :     {
    2719          93 :         decltype(nWordCount) n = 0;
    2720          93 :         const __m128i xmm_zero = _mm_setzero_si128();
                               // Runs only while a full group of 16 values remains.
    2721         278 :         for (; n < nWordCount - 15; n += 16)
    2722             :         {
    2723         370 :             __m128i xmm0 = _mm_loadu_si128(
    2724         185 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2725         185 :             __m128i xmm1 = _mm_loadu_si128(
    2726         185 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2727         185 :             xmm0 = _mm_max_epi16(xmm0, xmm_zero);
    2728         185 :             xmm1 = _mm_max_epi16(xmm1, xmm_zero);
    2729         185 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2730         185 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
    2731             :                              xmm1);
    2732             :         }
                               // Scalar tail: same clamp, one value at a time.
    2733         471 :         for (; n < nWordCount; n++)
    2734             :         {
    2735         378 :             pDstData[n] =
    2736         378 :                 pSrcData[n] < 0 ? 0 : static_cast<uint16_t>(pSrcData[n]);
    2737          93 :         }
    2738             :     }
    2739             :     else
    2740             :     {
    2741          43 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2742             :                               nDstPixelStride, nWordCount);
    2743             :     }
    2744         136 : }
    2745             : 
    2746             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2747             : 
    // GDALCopyWordsT specialization: uint32_t source -> int32_t destination.
    // Source values of INT_MAX or more are stored as INT_MAX; smaller values
    // are copied unchanged.
    // Packed case (both pixel strides equal to the 4-byte element size):
    // 8 values per iteration using two unaligned 128-bit loads, an unsigned
    // 32-bit minimum against INT_MAX and two unaligned stores, then a scalar
    // loop for the remainder.
    // Any other stride combination is delegated to GDALCopyWordsGenericT.
    2748             : template <>
    2749             : CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
    2750             :                                  int nSrcPixelStride,
    2751             :                                  int32_t *const CPL_RESTRICT pDstData,
    2752             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2753             : {
    2754             :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2755             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2756             :     {
    2757             :         decltype(nWordCount) n = 0;
    2758             :         const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
                               // Each iteration converts exactly 8 values (two vectors of
                               // four), so the index advances by 8 and the loop runs while
                               // a full group of 8 remains.
    2759             :         for (; n < nWordCount - 7; n += 8)
    2760             :         {
    2761             :             __m128i xmm0 = _mm_loadu_si128(
    2762             :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2763             :             __m128i xmm1 = _mm_loadu_si128(
    2764             :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    2765             :             xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
    2766             :             xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
    2767             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2768             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
    2769             :                              xmm1);
    2770             :         }
                               // Scalar tail: same clamp, one value at a time.
    2771             :         for (; n < nWordCount; n++)
    2772             :         {
    2773             :             pDstData[n] = pSrcData[n] >= INT_MAX
    2774             :                               ? INT_MAX
    2775             :                               : static_cast<int32_t>(pSrcData[n]);
    2776             :         }
    2777             :     }
    2778             :     else
    2779             :     {
    2780             :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2781             :                               nDstPixelStride, nWordCount);
    2782             :     }
    2783             : }
    2784             : 
    // GDALCopyWordsT specialization: int32_t source -> uint32_t destination.
    // Negative source values are stored as 0; others are copied unchanged.
    // Packed case (both pixel strides equal to the 4-byte element size):
    // 8 values per iteration using two unaligned 128-bit loads, a signed
    // 32-bit maximum against zero and two unaligned stores, then a scalar
    // loop for the remainder.
    // Any other stride combination is delegated to GDALCopyWordsGenericT.
    2785             : template <>
    2786             : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    2787             :                                  int nSrcPixelStride,
    2788             :                                  uint32_t *const CPL_RESTRICT pDstData,
    2789             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2790             : {
    2791             :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2792             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2793             :     {
    2794             :         decltype(nWordCount) n = 0;
    2795             :         const __m128i xmm_zero = _mm_setzero_si128();
                               // Runs only while a full group of 8 values remains.
    2796             :         for (; n < nWordCount - 7; n += 8)
    2797             :         {
    2798             :             __m128i xmm0 = _mm_loadu_si128(
    2799             :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2800             :             __m128i xmm1 = _mm_loadu_si128(
    2801             :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    2802             :             xmm0 = _mm_max_epi32(xmm0, xmm_zero);
    2803             :             xmm1 = _mm_max_epi32(xmm1, xmm_zero);
    2804             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2805             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
    2806             :                              xmm1);
    2807             :         }
                               // Scalar tail: same clamp, one value at a time.
    2808             :         for (; n < nWordCount; n++)
    2809             :         {
    2810             :             pDstData[n] =
    2811             :                 pSrcData[n] < 0 ? 0 : static_cast<uint32_t>(pSrcData[n]);
    2812             :         }
    2813             :     }
    2814             :     else
    2815             :     {
    2816             :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2817             :                               nDstPixelStride, nWordCount);
    2818             :     }
    2819             : }
    2820             : 
    2821             : #endif  // defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2822             : 
    // GDALCopyWordsT specialization: uint16_t source -> float destination.
    // Packed case (source stride 2 bytes, destination stride 4 bytes):
    // 8 values per iteration. One unaligned 128-bit load is widened to two
    // vectors of 32-bit integers by interleaving with zero (zero extension),
    // each is converted to four floats and written with an unaligned store.
    // The destination is addressed through a byte pointer, so the store
    // offsets (n * 4 and n * 4 + 16) are in bytes.
    // A scalar loop converts the remaining values.
    // Any other stride combination is delegated to GDALCopyWordsGenericT.
    2823             : template <>
    2824         403 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2825             :                                  int nSrcPixelStride,
    2826             :                                  float *const CPL_RESTRICT pDstData,
    2827             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2828             : {
    2829         403 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2830             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2831             :     {
    2832         397 :         decltype(nWordCount) n = 0;
    2833         397 :         const __m128i xmm_zero = _mm_setzero_si128();
    2834         397 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2835             :             reinterpret_cast<GByte *>(pDstData);
                               // Runs only while a full group of 8 values remains.
    2836        1688 :         for (; n < nWordCount - 7; n += 8)
    2837             :         {
    2838        1291 :             __m128i xmm = _mm_loadu_si128(
    2839        1291 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2840        1291 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2841        1291 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2842        1291 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2843        1291 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2844        1291 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2845             :                           xmm0_f);
    2846             :             _mm_storeu_ps(
    2847        1291 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2848             :         }
                               // Scalar tail: implicit conversion, one value at a time.
    2849        1415 :         for (; n < nWordCount; n++)
    2850             :         {
    2851        1018 :             pDstData[n] = pSrcData[n];
    2852         397 :         }
    2853             :     }
    2854             :     else
    2855             :     {
    2856           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2857             :                               nDstPixelStride, nWordCount);
    2858             :     }
    2859         403 : }
    2860             : 
    // GDALCopyWordsT specialization: int16_t source -> float destination.
    // Packed case (source stride 2 bytes, destination stride 4 bytes):
    // 8 values per iteration. One unaligned 128-bit load is widened to two
    // vectors of 32-bit integers by interleaving each lane with its own
    // sign mask (sign extension), each is converted to four floats and
    // written with an unaligned store.
    // The destination is addressed through a byte pointer, so the store
    // offsets (n * 4 and n * 4 + 16) are in bytes.
    // A scalar loop converts the remaining values.
    // Any other stride combination is delegated to GDALCopyWordsGenericT.
    2861             : template <>
    2862     1076640 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2863             :                                  int nSrcPixelStride,
    2864             :                                  float *const CPL_RESTRICT pDstData,
    2865             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2866             : {
    2867     1076640 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2868             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2869             :     {
    2870       86742 :         decltype(nWordCount) n = 0;
    2871       86742 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2872             :             reinterpret_cast<GByte *>(pDstData);
                               // Runs only while a full group of 8 values remains.
    2873      586119 :         for (; n < nWordCount - 7; n += 8)
    2874             :         {
    2875      499377 :             __m128i xmm = _mm_loadu_si128(
    2876      499377 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
                                   // Arithmetic shift by 15 gives all-ones for negative
                                   // lanes and zero otherwise: the upper 16 bits needed
                                   // for sign extension.
    2877      499377 :             const auto sign = _mm_srai_epi16(xmm, 15);
    2878      499377 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
    2879      499377 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
    2880      499377 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2881      499377 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2882      499377 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2883             :                           xmm0_f);
    2884             :             _mm_storeu_ps(
    2885      499377 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2886             :         }
                               // Scalar tail: implicit conversion, one value at a time.
    2887      253882 :         for (; n < nWordCount; n++)
    2888             :         {
    2889      167140 :             pDstData[n] = pSrcData[n];
    2890       86742 :         }
    2891             :     }
    2892             :     else
    2893             :     {
    2894      989901 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2895             :                               nDstPixelStride, nWordCount);
    2896             :     }
    2897     1076640 : }
    2898             : 
    // GDALCopyWordsT specialization: uint16_t source -> double destination.
    // Packed case (source stride 2 bytes, destination stride 8 bytes):
    // 8 values per iteration. One unaligned 128-bit load is widened to two
    // vectors of 32-bit integers by interleaving with zero (zero extension).
    // _mm_cvtepi32_pd converts only the two low 32-bit lanes, so each vector
    // is converted once, shifted right by 8 bytes, and converted again,
    // giving four 2-double results written with unaligned stores.
    // The destination is addressed through a byte pointer, so the store
    // offsets (n * 8 + 0/16/32/48) are in bytes.
    // A scalar loop converts the remaining values.
    // Any other stride combination is delegated to GDALCopyWordsGenericT.
    2899             : template <>
    2900         449 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2901             :                                  int nSrcPixelStride,
    2902             :                                  double *const CPL_RESTRICT pDstData,
    2903             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2904             : {
    2905         449 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2906             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2907             :     {
    2908         313 :         decltype(nWordCount) n = 0;
    2909         313 :         const __m128i xmm_zero = _mm_setzero_si128();
    2910         313 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2911             :             reinterpret_cast<GByte *>(pDstData);
                               // Runs only while a full group of 8 values remains.
    2912         829 :         for (; n < nWordCount - 7; n += 8)
    2913             :         {
    2914         516 :             __m128i xmm = _mm_loadu_si128(
    2915         516 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2916         516 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2917         516 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2918             : 
    2919         516 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2920         516 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
                                   // Bring the upper two 32-bit lanes down for the second
                                   // conversion.
    2921         516 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2922         516 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2923         516 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2924         516 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2925             : 
    2926         516 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2927             :                           xmm0_low_d);
    2928             :             _mm_storeu_pd(
    2929         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2930             :                 xmm0_high_d);
    2931             :             _mm_storeu_pd(
    2932         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2933             :                 xmm1_low_d);
    2934             :             _mm_storeu_pd(
    2935         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2936             :                 xmm1_high_d);
    2937             :         }
                               // Scalar tail: implicit conversion, one value at a time.
    2938        1082 :         for (; n < nWordCount; n++)
    2939             :         {
    2940         769 :             pDstData[n] = pSrcData[n];
    2941         313 :         }
    2942             :     }
    2943             :     else
    2944             :     {
    2945         136 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2946             :                               nDstPixelStride, nWordCount);
    2947             :     }
    2948         449 : }
    2949             : 
    // GDALCopyWordsT specialization: int16_t source -> double destination.
    // Packed case (source stride 2 bytes, destination stride 8 bytes):
    // 8 values per iteration. One unaligned 128-bit load is widened to two
    // vectors of 32-bit integers by interleaving each lane with its own
    // sign mask (sign extension). _mm_cvtepi32_pd converts only the two low
    // 32-bit lanes, so each vector is converted once, shifted right by
    // 8 bytes, and converted again, giving four 2-double results written
    // with unaligned stores.
    // The destination is addressed through a byte pointer, so the store
    // offsets (n * 8 + 0/16/32/48) are in bytes.
    // A scalar loop converts the remaining values.
    // Any other stride combination is delegated to GDALCopyWordsGenericT.
    2950             : template <>
    2951     4923280 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2952             :                                  int nSrcPixelStride,
    2953             :                                  double *const CPL_RESTRICT pDstData,
    2954             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2955             : {
    2956     4923280 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2957             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2958             :     {
    2959       34874 :         decltype(nWordCount) n = 0;
    2960       34874 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2961             :             reinterpret_cast<GByte *>(pDstData);
                               // Runs only while a full group of 8 values remains.
    2962      403828 :         for (; n < nWordCount - 7; n += 8)
    2963             :         {
    2964      368954 :             __m128i xmm = _mm_loadu_si128(
    2965      368954 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
                                   // Arithmetic shift by 15 gives all-ones for negative
                                   // lanes and zero otherwise: the upper 16 bits needed
                                   // for sign extension.
    2966      368954 :             const auto sign = _mm_srai_epi16(xmm, 15);
    2967      368954 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
    2968      368954 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
    2969             : 
    2970      368954 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2971      368954 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
                                   // Bring the upper two 32-bit lanes down for the second
                                   // conversion.
    2972      368954 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2973      368954 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2974      368954 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2975      368954 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2976             : 
    2977      368954 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2978             :                           xmm0_low_d);
    2979             :             _mm_storeu_pd(
    2980      368954 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2981             :                 xmm0_high_d);
    2982             :             _mm_storeu_pd(
    2983      368954 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2984             :                 xmm1_low_d);
    2985             :             _mm_storeu_pd(
    2986      368954 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2987             :                 xmm1_high_d);
    2988             :         }
                               // Scalar tail: implicit conversion, one value at a time.
    2989      255934 :         for (; n < nWordCount; n++)
    2990             :         {
    2991      221060 :             pDstData[n] = pSrcData[n];
    2992       34874 :         }
    2993             :     }
    2994             :     else
    2995             :     {
    2996     4888400 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2997             :                               nDstPixelStride, nWordCount);
    2998             :     }
    2999     4923280 : }
    3000             : 
    3001             : // ---- AVX2 helpers for int32 narrowing (runtime dispatch) ----
    3002             : 
    3003             : #if defined(HAVE_AVX2_DISPATCH)
    3004             : #if !defined(_MSC_VER)
    3005             : __attribute__((target("avx2")))
    3006             : #endif
    3007       12723 : static void GDALCopyWordsInt32ToUInt8_AVX2(const int32_t *CPL_RESTRICT pSrc,
    3008             :                                            uint8_t *CPL_RESTRICT pDst,
    3009             :                                            GPtrDiff_t nWordCount)
    3010             : {
    3011       12723 :     const __m256i permuteIdx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
    3012       12723 :     GPtrDiff_t n = 0;
    3013      958119 :     for (; n < nWordCount - 31; n += 32)
    3014             :     {
    3015             :         __m256i v0 =
    3016      945396 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
    3017             :         __m256i v1 =
    3018      945396 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
    3019      945396 :         __m256i v2 = _mm256_loadu_si256(
    3020      945396 :             reinterpret_cast<const __m256i *>(pSrc + n + 16));
    3021      945396 :         __m256i v3 = _mm256_loadu_si256(
    3022      945396 :             reinterpret_cast<const __m256i *>(pSrc + n + 24));
    3023             :         // Clamp to [0, 255]
    3024             :         // Pack int32 -> int16 -> uint8, then fix cross-lane ordering
    3025      945396 :         __m256i ab16 = _mm256_packs_epi32(v0, v1);
    3026      945396 :         __m256i cd16 = _mm256_packs_epi32(v2, v3);
    3027      945396 :         __m256i bytes = _mm256_packus_epi16(ab16, cd16);
    3028      945396 :         bytes = _mm256_permutevar8x32_epi32(bytes, permuteIdx);
    3029      945396 :         _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), bytes);
    3030             :     }
    3031       68589 :     for (; n < nWordCount; n++)
    3032             :     {
    3033       70955 :         pDst[n] = pSrc[n] <= 0     ? 0
    3034       15089 :                   : pSrc[n] >= 255 ? 255
    3035        1075 :                                    : static_cast<uint8_t>(pSrc[n]);
    3036             :     }
    3037       12723 : }
    3038             : 
    3039             : #if !defined(_MSC_VER)
    3040             : __attribute__((target("avx2")))
    3041             : #endif
    3042       10277 : static void GDALCopyWordsInt32ToUInt16_AVX2(const int32_t *CPL_RESTRICT pSrc,
    3043             :                                             uint16_t *CPL_RESTRICT pDst,
    3044             :                                             GPtrDiff_t nWordCount)
    3045             : {
    3046             :     // _mm256_packus_epi32(v0, v1) packs within each 128-bit lane, so the
    3047             :     // result is [v0_lo4, v1_lo4, v0_hi4, v1_hi4] (groups of 4 uint16 values).
    3048             :     // Permute to deinterleave: all v0 values first, then all v1 values.
    3049       10277 :     const __m256i permuteIdx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
    3050       10277 :     GPtrDiff_t n = 0;
    3051      670572 :     for (; n < nWordCount - 15; n += 16)
    3052             :     {
    3053             :         __m256i v0 =
    3054      660295 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
    3055             :         __m256i v1 =
    3056     1320590 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
    3057             :         // Clamp to [0, 65535]: _mm256_packus_epi32 packs with unsigned saturation
    3058      660295 :         __m256i packed = _mm256_packus_epi32(v0, v1);
    3059             :         // Fix cross-lane interleave from packus
    3060      660295 :         packed = _mm256_permutevar8x32_epi32(packed, permuteIdx);
    3061      660295 :         _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), packed);
    3062             :     }
    3063      163928 :     for (; n < nWordCount; n++)
    3064             :     {
    3065      307282 :         pDst[n] = pSrc[n] <= 0       ? 0
    3066      153631 :                   : pSrc[n] >= 65535 ? 65535
    3067      153599 :                                      : static_cast<uint16_t>(pSrc[n]);
    3068             :     }
    3069       10277 : }
    3070             : #endif  // HAVE_AVX2_DISPATCH
    3071             : 
    3072             : // ---- int32 -> uint8 with clamping to [0, 255] ----
    3073             : template <>
    3074       12837 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    3075             :                                  int nSrcPixelStride,
    3076             :                                  uint8_t *const CPL_RESTRICT pDstData,
    3077             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3078             : {
    3079       12837 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    3080             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    3081             :     {
    3082             : #if defined(HAVE_AVX2_DISPATCH)
    3083       12723 :         if (CPLHaveRuntimeAVX2())
    3084             :         {
    3085       12723 :             GDALCopyWordsInt32ToUInt8_AVX2(pSrcData, pDstData, nWordCount);
    3086       12723 :             return;
    3087             :         }
    3088             : #endif
    3089             : #ifdef HAVE_SSE2
    3090             :         // SSE2 path: 16 pixels per iteration
    3091           0 :         decltype(nWordCount) n = 0;
    3092           0 :         for (; n < nWordCount - 15; n += 16)
    3093             :         {
    3094           0 :             __m128i v0 = _mm_loadu_si128(
    3095           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    3096           0 :             __m128i v1 = _mm_loadu_si128(
    3097           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    3098           0 :             __m128i v2 = _mm_loadu_si128(
    3099           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    3100           0 :             __m128i v3 = _mm_loadu_si128(
    3101           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 12));
    3102             :             // Pack int32->int16 with signed saturation to [-32768,32767] range
    3103           0 :             __m128i lo16 = _mm_packs_epi32(v0, v1);
    3104           0 :             __m128i hi16 = _mm_packs_epi32(v2, v3);
    3105             :             // Pack int16->uint8 with unsigned saturation to [0,255] range
    3106           0 :             __m128i bytes = _mm_packus_epi16(lo16, hi16);
    3107           0 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), bytes);
    3108             :         }
    3109           0 :         for (; n < nWordCount; n++)
    3110             : #else
    3111             :         for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3112             : #endif
    3113             :         {
    3114           0 :             pDstData[n] = pSrcData[n] <= 0 ? 0
    3115           0 :                           : pSrcData[n] >= 255
    3116             :                               ? 255
    3117           0 :                               : static_cast<uint8_t>(pSrcData[n]);
    3118           0 :         }
    3119             :     }
    3120             :     else
    3121             :     {
    3122         114 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    3123             :                               nDstPixelStride, nWordCount);
    3124             :     }
    3125             : }
    3126             : 
    3127             : // ---- int32 -> uint16 with clamping to [0, 65535] ----
    3128             : template <>
    3129       10322 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    3130             :                                  int nSrcPixelStride,
    3131             :                                  uint16_t *const CPL_RESTRICT pDstData,
    3132             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3133             : {
    3134       10322 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    3135             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    3136             :     {
    3137             : #if defined(HAVE_AVX2_DISPATCH)
    3138       10277 :         if (CPLHaveRuntimeAVX2())
    3139             :         {
    3140       10277 :             GDALCopyWordsInt32ToUInt16_AVX2(pSrcData, pDstData, nWordCount);
    3141       10277 :             return;
    3142             :         }
    3143             : #endif
    3144           0 :         decltype(nWordCount) n = 0;
    3145             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    3146             :         // SSE4.1: _mm_packus_epi32 directly handles uint saturation
    3147             :         for (; n < nWordCount - 7; n += 8)
    3148             :         {
    3149             :             __m128i v0 = _mm_loadu_si128(
    3150             :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    3151             :             __m128i v1 = _mm_loadu_si128(
    3152             :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    3153             :             __m128i packed = _mm_packus_epi32(v0, v1);
    3154             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), packed);
    3155             :         }
    3156             : #else
    3157             :         // SSE2: clamp to [0, 65535], bias to signed range, pack, unbias
    3158           0 :         const __m128i xmm_65535 = _mm_set1_epi32(65535);
    3159           0 :         const __m128i xmm_bias32 = _mm_set1_epi32(32768);
    3160           0 :         const __m128i xmm_bias16 = _mm_set1_epi16(-32768);
    3161           0 :         for (; n < nWordCount - 7; n += 8)
    3162             :         {
    3163           0 :             __m128i v0 = _mm_loadu_si128(
    3164           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    3165           0 :             __m128i v1 = _mm_loadu_si128(
    3166           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    3167             :             // max(v, 0)
    3168           0 :             v0 = _mm_andnot_si128(_mm_srai_epi32(v0, 31), v0);
    3169           0 :             v1 = _mm_andnot_si128(_mm_srai_epi32(v1, 31), v1);
    3170             :             // min(v, 65535)
    3171           0 :             __m128i gt0 = _mm_cmpgt_epi32(v0, xmm_65535);
    3172           0 :             __m128i gt1 = _mm_cmpgt_epi32(v1, xmm_65535);
    3173           0 :             v0 = _mm_or_si128(_mm_andnot_si128(gt0, v0),
    3174             :                               _mm_and_si128(gt0, xmm_65535));
    3175           0 :             v1 = _mm_or_si128(_mm_andnot_si128(gt1, v1),
    3176             :                               _mm_and_si128(gt1, xmm_65535));
    3177             :             // Shift [0, 65535] -> [-32768, 32767] for _mm_packs_epi32
    3178           0 :             v0 = _mm_sub_epi32(v0, xmm_bias32);
    3179           0 :             v1 = _mm_sub_epi32(v1, xmm_bias32);
    3180           0 :             __m128i packed = _mm_packs_epi32(v0, v1);
    3181             :             // Shift back: sub_epi16(x, -32768) == add 32768 (mod 2^16)
    3182           0 :             packed = _mm_sub_epi16(packed, xmm_bias16);
    3183           0 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), packed);
    3184             :         }
    3185             : #endif
    3186           0 :         for (; n < nWordCount; n++)
    3187             :         {
    3188           0 :             pDstData[n] = pSrcData[n] <= 0 ? 0
    3189           0 :                           : pSrcData[n] >= 65535
    3190             :                               ? 65535
    3191           0 :                               : static_cast<uint16_t>(pSrcData[n]);
    3192           0 :         }
    3193             :     }
    3194             :     else
    3195             :     {
    3196          45 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    3197             :                               nDstPixelStride, nWordCount);
    3198             :     }
    3199             : }
    3200             : 
    3201             : #endif  // HAVE_SSE2
    3202             : 
    3203             : template <>
    3204     4426980 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3205             :                                  int nSrcPixelStride,
    3206             :                                  GByte *const CPL_RESTRICT pDstData,
    3207             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3208             : {
    3209     4426980 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3210             :                             nDstPixelStride, nWordCount);
    3211     4426980 : }
    3212             : 
    3213             : template <>
    3214       38387 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3215             :                                  int nSrcPixelStride,
    3216             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    3217             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3218             : {
    3219       38387 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3220             :                             nDstPixelStride, nWordCount);
    3221       38387 : }
    3222             : 
    3223             : template <>
    3224       55671 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3225             :                                  int nSrcPixelStride,
    3226             :                                  double *const CPL_RESTRICT pDstData,
    3227             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3228             : {
    3229       55671 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3230             :                             nDstPixelStride, nWordCount);
    3231       55671 : }
    3232             : 
    3233             : template <>
    3234      122846 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3235             :                                  int nSrcPixelStride,
    3236             :                                  float *const CPL_RESTRICT pDstData,
    3237             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3238             : {
    3239      122846 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3240             :                             nDstPixelStride, nWordCount);
    3241      122846 : }
    3242             : 
    3243             : template <>
    3244         412 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
    3245             :                                  int nSrcPixelStride,
    3246             :                                  float *const CPL_RESTRICT pDstData,
    3247             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3248             : {
    3249         412 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3250             :                             nDstPixelStride, nWordCount);
    3251         412 : }
    3252             : 
    3253             : template <>
    3254         544 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
    3255             :                                  int nSrcPixelStride,
    3256             :                                  double *const CPL_RESTRICT pDstData,
    3257             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3258             : {
    3259         544 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3260             :                             nDstPixelStride, nWordCount);
    3261         544 : }
    3262             : 
    3263             : template <>
    3264      314423 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3265             :                                  int nSrcPixelStride,
    3266             :                                  GByte *const CPL_RESTRICT pDstData,
    3267             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3268             : {
    3269      314423 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3270             :                             nDstPixelStride, nWordCount);
    3271      314423 : }
    3272             : 
    3273             : template <>
    3274          55 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3275             :                                  int nSrcPixelStride,
    3276             :                                  GInt8 *const CPL_RESTRICT pDstData,
    3277             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3278             : {
    3279          55 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3280             :                             nDstPixelStride, nWordCount);
    3281          55 : }
    3282             : 
    3283             : template <>
    3284       15785 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3285             :                                  int nSrcPixelStride,
    3286             :                                  GInt16 *const CPL_RESTRICT pDstData,
    3287             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3288             : {
    3289       15785 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3290             :                             nDstPixelStride, nWordCount);
    3291       15785 : }
    3292             : 
    3293             : template <>
    3294       61713 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3295             :                                  int nSrcPixelStride,
    3296             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    3297             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3298             : {
    3299       61713 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3300             :                             nDstPixelStride, nWordCount);
    3301       61713 : }
    3302             : 
    3303             : template <>
    3304       43985 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3305             :                                  int nSrcPixelStride,
    3306             :                                  GInt32 *const CPL_RESTRICT pDstData,
    3307             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3308             : {
    3309       43985 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3310             :                             nDstPixelStride, nWordCount);
    3311       43985 : }
    3312             : 
    3313             : template <>
    3314          72 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3315             :                                  int nSrcPixelStride,
    3316             :                                  GFloat16 *const CPL_RESTRICT pDstData,
    3317             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3318             : {
    3319          72 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3320             :                             nDstPixelStride, nWordCount);
    3321          72 : }
    3322             : 
    3323             : template <>
    3324          63 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3325             :                                  int nSrcPixelStride,
    3326             :                                  GFloat16 *const CPL_RESTRICT pDstData,
    3327             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3328             : {
    3329          63 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3330             :                             nDstPixelStride, nWordCount);
    3331          63 : }
    3332             : 
    3333             : /************************************************************************/
    3334             : /*                       GDALCopyWordsComplexT()                        */
    3335             : /************************************************************************/
    3336             : /**
    3337             :  * Template function, used to copy data from pSrcData into buffer
    3338             :  * pDstData, with stride nSrcPixelStride in the source data and
    3339             :  * stride nDstPixelStride in the destination data. Deals with the
    3340             :  * complex case, where input is complex and output is complex.
    3341             :  *
    3342             :  * @param pSrcData the source data buffer
    3343             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    3344             :  *                      of interest, in bytes.
    3345             :  * @param pDstData the destination buffer.
    3346             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    3347             :  *                      interest, in bytes.
    3348             :  * @param nWordCount the total number of pixel words to copy
    3349             :  *
    3350             :  */
    3351             : template <class Tin, class Tout>
    3352       98788 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
    3353             :                                   int nSrcPixelStride,
    3354             :                                   Tout *const CPL_RESTRICT pDstData,
    3355             :                                   int nDstPixelStride, GPtrDiff_t nWordCount)
    3356             : {
    3357       98788 :     decltype(nWordCount) nDstOffset = 0;
    3358       98788 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    3359       98788 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    3360             : 
    3361     5631239 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3362             :     {
    3363     5532446 :         const Tin *const pPixelIn =
    3364     5532446 :             reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    3365     5532446 :         Tout *const pPixelOut =
    3366     5532446 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    3367             : 
    3368     5532446 :         GDALCopyWord(pPixelIn[0], pPixelOut[0]);
    3369     5532446 :         GDALCopyWord(pPixelIn[1], pPixelOut[1]);
    3370             : 
    3371     5532446 :         nDstOffset += nDstPixelStride;
    3372             :     }
    3373       98788 : }
    3374             : 
    3375             : /************************************************************************/
    3376             : /*                      GDALCopyWordsComplexOutT()                      */
    3377             : /************************************************************************/
    3378             : /**
    3379             :  * Template function, used to copy data from pSrcData into buffer
    3380             :  * pDstData, with stride nSrcPixelStride in the source data and
    3381             :  * stride nDstPixelStride in the destination data. Deals with the
    3382             :  * case where the value is real coming in, but complex going out.
    3383             :  *
    3384             :  * @param pSrcData the source data buffer
    3385             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    3386             :  *                      of interest, in bytes.
    3387             :  * @param pDstData the destination buffer.
    3388             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    3389             :  *                      interest, in bytes.
    3390             :  * @param nWordCount the total number of pixel words to copy
    3391             :  *
    3392             :  */
    3393             : template <class Tin, class Tout>
    3394        4762 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
    3395             :                                      int nSrcPixelStride,
    3396             :                                      Tout *const CPL_RESTRICT pDstData,
    3397             :                                      int nDstPixelStride, GPtrDiff_t nWordCount)
    3398             : {
    3399        4762 :     decltype(nWordCount) nDstOffset = 0;
    3400             : 
    3401        4762 :     const Tout tOutZero = static_cast<Tout>(0);
    3402             : 
    3403        4762 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    3404        4762 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    3405             : 
    3406     1190408 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3407             :     {
    3408     1185646 :         const Tin tValue =
    3409     1185646 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    3410     1185646 :         Tout *const pPixelOut =
    3411     1185646 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    3412     1185646 :         GDALCopyWord(tValue, *pPixelOut);
    3413             : 
    3414     1185646 :         pPixelOut[1] = tOutZero;
    3415             : 
    3416     1185646 :         nDstOffset += nDstPixelStride;
    3417             :     }
    3418        4762 : }
    3419             : 
    3420             : /************************************************************************/
    3421             : /*                         GDALCopyWordsFromT()                         */
    3422             : /************************************************************************/
    3423             : /**
    3424             :  * Template driver function. Given the input type T, call the appropriate
    3425             :  * GDALCopyWordsT function template for the desired output type. You should
    3426             :  * never call this function directly (call GDALCopyWords instead).
    3427             :  *
    3428             :  * @param pSrcData source data buffer
    3429             :  * @param nSrcPixelStride pixel stride in input buffer, in bytes
    3430             :  * @param bInComplex input is complex
    3431             :  * @param pDstData destination data buffer
    3432             :  * @param eDstType destination data type
    3433             :  * @param nDstPixelStride pixel stride in output buffer, in bytes
    3434             :  * @param nWordCount number of pixel words to be copied
    3435             :  */
    3436             : template <class T>
    3437    61292925 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
    3438             :                                int nSrcPixelStride, bool bInComplex,
    3439             :                                void *CPL_RESTRICT pDstData,
    3440             :                                GDALDataType eDstType, int nDstPixelStride,
    3441             :                                GPtrDiff_t nWordCount)
    3442             : {
    3443    61292925 :     switch (eDstType)
    3444             :     {
    3445     4785549 :         case GDT_UInt8:
    3446     4785549 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3447             :                            static_cast<unsigned char *>(pDstData),
    3448             :                            nDstPixelStride, nWordCount);
    3449     4785549 :             break;
    3450        1891 :         case GDT_Int8:
    3451        1891 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3452             :                            static_cast<signed char *>(pDstData),
    3453             :                            nDstPixelStride, nWordCount);
    3454        1891 :             break;
    3455     1143544 :         case GDT_UInt16:
    3456     1143544 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3457             :                            static_cast<unsigned short *>(pDstData),
    3458             :                            nDstPixelStride, nWordCount);
    3459     1143544 :             break;
    3460     4162728 :         case GDT_Int16:
    3461     4162728 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3462             :                            static_cast<short *>(pDstData), nDstPixelStride,
    3463             :                            nWordCount);
    3464     4162728 :             break;
    3465       23084 :         case GDT_UInt32:
    3466       23084 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3467             :                            static_cast<unsigned int *>(pDstData),
    3468             :                            nDstPixelStride, nWordCount);
    3469       23084 :             break;
    3470    29460249 :         case GDT_Int32:
    3471    29460249 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3472             :                            static_cast<int *>(pDstData), nDstPixelStride,
    3473             :                            nWordCount);
    3474    29460249 :             break;
    3475        1250 :         case GDT_UInt64:
    3476        1250 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3477             :                            static_cast<std::uint64_t *>(pDstData),
    3478             :                            nDstPixelStride, nWordCount);
    3479        1250 :             break;
    3480        5957 :         case GDT_Int64:
    3481        5957 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3482             :                            static_cast<std::int64_t *>(pDstData),
    3483             :                            nDstPixelStride, nWordCount);
    3484        5957 :             break;
    3485         999 :         case GDT_Float16:
    3486         999 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3487             :                            static_cast<GFloat16 *>(pDstData), nDstPixelStride,
    3488             :                            nWordCount);
    3489         999 :             break;
    3490     4216051 :         case GDT_Float32:
    3491     4216051 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3492             :                            static_cast<float *>(pDstData), nDstPixelStride,
    3493             :                            nWordCount);
    3494     4216051 :             break;
    3495    17387964 :         case GDT_Float64:
    3496    17387964 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3497             :                            static_cast<double *>(pDstData), nDstPixelStride,
    3498             :                            nWordCount);
    3499    17387964 :             break;
    3500       94424 :         case GDT_CInt16:
    3501       94424 :             if (bInComplex)
    3502             :             {
    3503       93170 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3504             :                                       static_cast<short *>(pDstData),
    3505             :                                       nDstPixelStride, nWordCount);
    3506             :             }
    3507             :             else  // input is not complex, so we need to promote to a complex
    3508             :                   // buffer
    3509             :             {
    3510        1254 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3511             :                                          static_cast<short *>(pDstData),
    3512             :                                          nDstPixelStride, nWordCount);
    3513             :             }
    3514       94424 :             break;
    3515        1349 :         case GDT_CInt32:
    3516        1349 :             if (bInComplex)
    3517             :             {
    3518         717 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3519             :                                       static_cast<int *>(pDstData),
    3520             :                                       nDstPixelStride, nWordCount);
    3521             :             }
    3522             :             else  // input is not complex, so we need to promote to a complex
    3523             :                   // buffer
    3524             :             {
    3525         632 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3526             :                                          static_cast<int *>(pDstData),
    3527             :                                          nDstPixelStride, nWordCount);
    3528             :             }
    3529        1349 :             break;
    3530         313 :         case GDT_CFloat16:
    3531         313 :             if (bInComplex)
    3532             :             {
    3533          48 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3534             :                                       static_cast<GFloat16 *>(pDstData),
    3535             :                                       nDstPixelStride, nWordCount);
    3536             :             }
    3537             :             else  // input is not complex, so we need to promote to a complex
    3538             :                   // buffer
    3539             :             {
    3540         265 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3541             :                                          static_cast<GFloat16 *>(pDstData),
    3542             :                                          nDstPixelStride, nWordCount);
    3543             :             }
    3544         313 :             break;
    3545        3924 :         case GDT_CFloat32:
    3546        3924 :             if (bInComplex)
    3547             :             {
    3548        3115 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3549             :                                       static_cast<float *>(pDstData),
    3550             :                                       nDstPixelStride, nWordCount);
    3551             :             }
    3552             :             else  // input is not complex, so we need to promote to a complex
    3553             :                   // buffer
    3554             :             {
    3555         809 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3556             :                                          static_cast<float *>(pDstData),
    3557             :                                          nDstPixelStride, nWordCount);
    3558             :             }
    3559        3924 :             break;
    3560        3540 :         case GDT_CFloat64:
    3561        3540 :             if (bInComplex)
    3562             :             {
    3563        1738 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3564             :                                       static_cast<double *>(pDstData),
    3565             :                                       nDstPixelStride, nWordCount);
    3566             :             }
    3567             :             else  // input is not complex, so we need to promote to a complex
    3568             :                   // buffer
    3569             :             {
    3570        1802 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3571             :                                          static_cast<double *>(pDstData),
    3572             :                                          nDstPixelStride, nWordCount);
    3573             :             }
    3574        3540 :             break;
    3575           0 :         case GDT_Unknown:
    3576             :         case GDT_TypeCount:
    3577           0 :             CPLAssert(false);
    3578             :     }
    3579    61292925 : }
    3580             : 
    3581             : }  // end anonymous namespace
    3582             : 
    3583             : /************************************************************************/
    3584             : /*                         GDALReplicateWord()                          */
    3585             : /************************************************************************/
    3586             : 
    3587             : template <class T>
    3588      600405 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
    3589             :                                GPtrDiff_t nWordCount)
    3590             : {
                           // Replicates the T value already stored at pDstData into the
                           // nWordCount destination slots that follow it.
                           //   pDstData:        first destination element; it is only read here.
                           //   nDstPixelStride: distance in bytes between consecutive elements.
                           //   nWordCount:      number of copies written after the first element.
    3591      600405 :     const T valSet = *static_cast<const T *>(pDstData);
    3592      600405 :     if (nDstPixelStride == static_cast<int>(sizeof(T)))
    3593             :     {
                               // Packed destination: plain T* stores starting at the second
                               // element, four at a time, then a scalar tail for the rest.
    3594      570592 :         T *pDstPtr = static_cast<T *>(pDstData) + 1;
    3595    31990099 :         while (nWordCount >= 4)
    3596             :         {
    3597    31419540 :             nWordCount -= 4;
    3598    31419540 :             pDstPtr[0] = valSet;
    3599    31419540 :             pDstPtr[1] = valSet;
    3600    31419540 :             pDstPtr[2] = valSet;
    3601    31419540 :             pDstPtr[3] = valSet;
    3602    31419540 :             pDstPtr += 4;
    3603             :         }
    3604     1476627 :         while (nWordCount > 0)
    3605             :         {
    3606      906035 :             --nWordCount;
    3607      906035 :             *pDstPtr = valSet;
    3608      906035 :             pDstPtr++;
    3609             :         }
    3610             :     }
    3611             :     else
    3612             :     {
                               // Strided destination: advance a byte pointer by nDstPixelStride
                               // and store through a T* cast.
                               // NOTE(review): nothing here checks that the resulting addresses
                               // are suitably aligned for T - confirm callers guarantee it.
    3613       29813 :         GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
    3614     1040984 :         while (nWordCount > 0)
    3615             :         {
    3616     1011171 :             --nWordCount;
    3617     1011171 :             *reinterpret_cast<T *>(pabyDstPtr) = valSet;
    3618     1011171 :             pabyDstPtr += nDstPixelStride;
    3619             :         }
    3620             :     }
    3621      600405 : }
    3622             : 
    3623     1068100 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
    3624             :                               GDALDataType eSrcType,
    3625             :                               void *CPL_RESTRICT pDstData,
    3626             :                               GDALDataType eDstType, int nDstPixelStride,
    3627             :                               GPtrDiff_t nWordCount)
    3628             : {
                           // Fills nWordCount destination words with one source value.
                           //   pSrcData / eSrcType: the single source word and its data type.
                           //   pDstData / eDstType: destination buffer and its data type.
                           //   nDstPixelStride:     distance in bytes between destination words.
                           //   nWordCount:          total number of destination words, including
                           //                        the first one written by GDALCopyWords64().
    3629             :     /* -----------------------------------------------------------------------
    3630             :      */
    3631             :     /* Special case when the source data is always the same value */
    3632             :     /* (for VRTSourcedRasterBand::IRasterIO and
    3633             :      * VRTDerivedRasterBand::IRasterIO*/
    3634             :     /*  for example) */
    3635             :     /* -----------------------------------------------------------------------
    3636             :      */
    3637             :     // Let the general translation case do the necessary conversions
    3638             :     // on the first destination element.
    3639     1068100 :     GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
    3640             : 
    3641             :     // Now copy the first element to the nWordCount - 1 following destination
    3642             :     // elements.
    3643     1068100 :     nWordCount--;
                           // Address of the second destination word; used directly by the byte
                           // and complex cases below.
    3644     1068100 :     GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
    3645             : 
    3646     1068100 :     switch (eDstType)
    3647             :     {
                               // Both 8-bit types are replicated as raw bytes.
    3648      467605 :         case GDT_UInt8:
    3649             :         case GDT_Int8:
    3650             :         {
    3651      467605 :             if (nDstPixelStride == 1)
    3652             :             {
                                       // Packed bytes: one memset covers all remaining words.
    3653      369689 :                 if (nWordCount > 0)
    3654      369689 :                     memset(pabyDstWord,
    3655      369689 :                            *reinterpret_cast<const GByte *>(pDstData),
    3656             :                            nWordCount);
    3657             :             }
    3658             :             else
    3659             :             {
                                       // Strided bytes: store the value one word at a time.
    3660       97916 :                 GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
    3661    67566000 :                 while (nWordCount > 0)
    3662             :                 {
    3663    67468100 :                     --nWordCount;
    3664    67468100 :                     *pabyDstWord = valSet;
    3665    67468100 :                     pabyDstWord += nDstPixelStride;
    3666             :                 }
    3667             :             }
    3668      467605 :             break;
    3669             :         }
    3670             : 
                       // Non-complex types wider than a byte: delegate to GDALReplicateWordT
                       // instantiated with the matching C type.
    3671             : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type)                               \
    3672             :     case enum_type:                                                            \
    3673             :     {                                                                          \
    3674             :         GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount);     \
    3675             :         break;                                                                 \
    3676             :     }
    3677             : 
    3678       34513 :             CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
    3679      202455 :             CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
    3680          74 :             CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
    3681      301585 :             CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
    3682          41 :             CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
    3683        1072 :             CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
    3684           2 :             CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
    3685       52858 :             CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
    3686        7805 :             CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
    3687             : 
                       // Complex types: a word is two consecutive c_type components, both read
                       // from the first destination word and written at every following word.
                       // The macro uses pDstData, pabyDstWord, nDstPixelStride and nWordCount
                       // from the enclosing scope.
    3688             : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type)                              \
    3689             :     case enum_type:                                                            \
    3690             :     {                                                                          \
    3691             :         c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0];        \
    3692             :         c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1];        \
    3693             :         while (nWordCount > 0)                                                 \
    3694             :         {                                                                      \
    3695             :             --nWordCount;                                                      \
    3696             :             reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1;              \
    3697             :             reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2;              \
    3698             :             pabyDstWord += nDstPixelStride;                                    \
    3699             :         }                                                                      \
    3700             :         break;                                                                 \
    3701             :     }
    3702             : 
    3703         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
    3704         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
    3705           6 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
    3706         790 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
    3707         790 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
    3708             : 
                               // Not valid destination types: only asserted, nothing is written
                               // beyond the first word.
    3709           0 :         case GDT_Unknown:
    3710             :         case GDT_TypeCount:
    3711           0 :             CPLAssert(false);
    3712             :     }
    3713     1068100 : }
    3714             : 
    3715             : /************************************************************************/
    3716             : /*                          GDALUnrolledCopy()                          */
    3717             : /************************************************************************/
    3718             : 
    3719             : template <class T, int srcStride, int dstStride>
    3720             : #if defined(__GNUC__) && defined(__AVX2__)
    3721             : __attribute__((optimize("tree-vectorize")))
    3722             : #endif
    3723     3000525 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
    3724             :                                            const T *CPL_RESTRICT pSrc,
    3725             :                                            GPtrDiff_t nIters)
    3726             : {
                           // Copies nIters elements of type T from pSrc to pDest.
                           // srcStride and dstStride are compile-time steps counted in elements
                           // of T (not bytes) between consecutive source / destination items.
                           //
                           // Two build variants share the final loop:
                           //  - GCC-compatible compiler with AVX2: the manual unrolling below is
                           //    compiled out; the function carries the "tree-vectorize" optimize
                           //    attribute and the loop is preceded by "#pragma GCC unroll 4".
                           //  - otherwise: groups of 16 elements are copied by the hand-unrolled
                           //    block, and the final loop handles the nIters % 16 leftovers.
    3727             : #if !(defined(__GNUC__) && defined(__AVX2__))
    3728     3000525 :     if (nIters >= 16)
    3729             :     {
    3730   132812687 :         for (GPtrDiff_t i = nIters / 16; i != 0; i--)
    3731             :         {
    3732   129932845 :             pDest[0 * dstStride] = pSrc[0 * srcStride];
    3733   129932845 :             pDest[1 * dstStride] = pSrc[1 * srcStride];
    3734   129932845 :             pDest[2 * dstStride] = pSrc[2 * srcStride];
    3735   129932845 :             pDest[3 * dstStride] = pSrc[3 * srcStride];
    3736   129932845 :             pDest[4 * dstStride] = pSrc[4 * srcStride];
    3737   129932845 :             pDest[5 * dstStride] = pSrc[5 * srcStride];
    3738   129932845 :             pDest[6 * dstStride] = pSrc[6 * srcStride];
    3739   129932845 :             pDest[7 * dstStride] = pSrc[7 * srcStride];
    3740   129932845 :             pDest[8 * dstStride] = pSrc[8 * srcStride];
    3741   129932845 :             pDest[9 * dstStride] = pSrc[9 * srcStride];
    3742   129932845 :             pDest[10 * dstStride] = pSrc[10 * srcStride];
    3743   129932845 :             pDest[11 * dstStride] = pSrc[11 * srcStride];
    3744   129932845 :             pDest[12 * dstStride] = pSrc[12 * srcStride];
    3745   129932845 :             pDest[13 * dstStride] = pSrc[13 * srcStride];
    3746   129932845 :             pDest[14 * dstStride] = pSrc[14 * srcStride];
    3747   129932845 :             pDest[15 * dstStride] = pSrc[15 * srcStride];
    3748   129932845 :             pDest += 16 * dstStride;
    3749   129932845 :             pSrc += 16 * srcStride;
    3750             :         }
                               // Only the leftover elements remain for the loop below.
    3751     2879967 :         nIters = nIters % 16;
    3752             :     }
    3753             : #else
    3754             : #pragma GCC unroll 4
    3755             : #endif
                           // The destination is indexed by i while the source pointer itself is
                           // advanced, so pDest is left unchanged by this loop.
    3756     5160769 :     for (GPtrDiff_t i = 0; i < nIters; i++)
    3757             :     {
    3758     2160243 :         pDest[i * dstStride] = *pSrc;
    3759     2160243 :         pSrc += srcStride;
    3760             :     }
    3761     3000525 : }
    3762             : 
    3763             : template <class T, int srcStride, int dstStride>
    3764     3000525 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
    3765             :                                     const T *CPL_RESTRICT pSrc,
    3766             :                                     GPtrDiff_t nIters)
    3767             : {
                           // Primary template: simply forwards to GDALUnrolledCopyGeneric with
                           // the same template arguments. Explicit specializations for GByte
                           // follow in this file, depending on the SIMD support detected at
                           // compile time.
    3768     3000525 :     GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
    3769     3000525 : }
    3770             : 
    3771             : #if defined(__AVX2__) && defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                \
    3772             :     (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS))
    3773             : 
    3774             : template <>
    3775             : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3776             :                                    const GByte *CPL_RESTRICT pSrc,
    3777             :                                    GPtrDiff_t nIters)
    3778             : {
                           // Specialization compiled when AVX2 and SSSE3 are both available at
                           // compile time (see the enclosing #if): writes every third source
                           // byte to consecutive destination bytes, nIters times.
                           // No runtime CPU check is made in this variant.
    3779             :     if (nIters > 16)
    3780             :     {
    3781             :         // The SSSE3 variant is slightly faster than what the gcc autovectorizer
    3782             :         // generates
    3783             :         GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    3784             :     }
    3785             :     else
    3786             :     {
                               // Short runs (16 items or fewer): plain scalar copy.
    3787             :         for (GPtrDiff_t i = 0; i < nIters; i++)
    3788             :         {
    3789             :             pDest[i] = *pSrc;
    3790             :             pSrc += 3;
    3791             :         }
    3792             :     }
    3793             : }
    3794             : 
    3795             : #elif defined(HAVE_SSE2) && !(defined(__GNUC__) && defined(__AVX2__))
    3796             : 
    3797             : template <>
    3798      354194 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
    3799             :                                    const GByte *CPL_RESTRICT pSrc,
    3800             :                                    GPtrDiff_t nIters)
    3801             : {
                           // SSE2 specialization: writes every second source byte (offsets 0, 2,
                           // 4, ...) to consecutive destination bytes, nIters times.
                           // i counts destination bytes already produced; pSrc is advanced in
                           // step with it.
    3802      354194 :     decltype(nIters) i = 0;
    3803      354194 :     if (nIters > 16)
    3804             :     {
                               // 0x00ff in every 16-bit lane: keeps the low byte of each lane,
                               // which on little-endian x86 is the byte at the even offset.
    3805      194667 :         const __m128i xmm_mask = _mm_set1_epi16(0xff);
    3806             :         // If we were sure that there would always be 1 trailing byte, we could
    3807             :         // check against nIters - 15
                               // Each pass reads 32 source bytes to produce 16 outputs. Looping
                               // only while more than 16 items remain means at least 33 source
                               // bytes are still due to be consumed, so the 32-byte read stays
                               // inside them. The last items go through the scalar loop below.
    3808     2988110 :         for (; i < nIters - 16; i += 16)
    3809             :         {
    3810             :             __m128i xmm0 =
    3811     2793440 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3812             :             __m128i xmm1 =
    3813     5586890 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3814             :             // Set higher 8bit of each int16 packed word to 0
    3815     2793440 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3816     2793440 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3817             :             // Pack int16 to uint8 and merge back both vector
                                   // (after masking every lane is in 0..255, so the saturating
                                   // pack leaves the values unchanged)
    3818     2793440 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    3819             : 
    3820             :             // Store result
    3821     2793440 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3822             : 
    3823     2793440 :             pSrc += 2 * 16;
    3824             :         }
    3825             :     }
                           // Scalar tail; it is also the whole copy when nIters <= 16.
    3826     4633800 :     for (; i < nIters; i++)
    3827             :     {
    3828     4279610 :         pDest[i] = *pSrc;
    3829     4279610 :         pSrc += 2;
    3830             :     }
    3831      354194 : }
    3832             : 
    3833           1 : static void GDALUnrolledCopy_GByte_3_1_SSE2(GByte *CPL_RESTRICT pDest,
    3834             :                                             const GByte *CPL_RESTRICT pSrc,
    3835             :                                             GPtrDiff_t nIters)
    3836             : {
                           // SSE2-only copy of every third source byte (offsets 0, 3, 6, ...) to
                           // consecutive destination bytes, nIters times.
                           //
                           // Each vector pass loads 48 source bytes into xmm0/xmm1/xmm2 and builds
                           // 16 output bytes with byte shifts and single-byte masks:
                           //   outputs 0..5   <- xmm0 bytes 0, 3, 6, 9, 12, 15
                           //   outputs 6..10  <- xmm1 bytes 2, 5, 8, 11, 14
                           //   outputs 11..15 <- xmm2 bytes 1, 4, 7, 10, 13
                           // A mask selects exactly one output position at a time and is moved
                           // up by one byte after each use.
                           //
                           // NOTE(review): there is no "nIters > 16" guard in this function. For
                           // small nIters the vector loop is skipped only if GPtrDiff_t is a
                           // signed type (so that nIters - 16 is negative) - confirm.
    3837           1 :     decltype(nIters) i = 0;
                           // Only byte 0 set: the selector for output position 0.
    3838           1 :     const __m128i xmm_mask_ori = _mm_set_epi32(0, 0, 0, 255);
    3839             :     // If we were sure that there would always be 2 trailing bytes, we could
    3840             :     // check against nIters - 15
                           // Looping only while more than 16 items remain means at least 49
                           // source bytes are still due to be consumed, so the 48-byte read stays
                           // inside them.
    3841           2 :     for (; i < nIters - 16; i += 16)
    3842             :     {
    3843             :         __m128i xmm0 =
    3844           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3845             :         __m128i xmm1 =
    3846           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3847             :         __m128i xmm2 =
    3848           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    3849             : 
                               // Selectors for the first output position fed by each register:
                               // position 0 (xmm0), position 6 (xmm1), position 11 (xmm2).
    3850           1 :         auto xmm_mask0 = xmm_mask_ori;
    3851           1 :         auto xmm_mask1 = _mm_slli_si128(xmm_mask_ori, 6);
    3852           1 :         auto xmm_mask2 = _mm_slli_si128(xmm_mask_ori, 11);
    3853             : 
                               // Output 0 <- xmm0 byte 0; output 6 <- xmm1 byte 2.
    3854           1 :         auto xmm = _mm_and_si128(xmm0, xmm_mask0);
    3855           1 :         auto xmm_res1 = _mm_and_si128(_mm_slli_si128(xmm1, 4), xmm_mask1);
    3856             : 
                               // Output 1 <- xmm0 byte 3; output 7 <- xmm1 byte 5.
                               // xmm0 is shifted down by 2 more bytes at each step, so that the
                               // next wanted byte lines up with the advancing mask.
    3857           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3858           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3859           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3860           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3861           2 :         xmm_res1 = _mm_or_si128(
    3862             :             xmm_res1, _mm_and_si128(_mm_slli_si128(xmm1, 2), xmm_mask1));
    3863             : 
                               // Output 2 <- xmm0 byte 6; output 8 <- xmm1 byte 8 (no shift).
    3864           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3865           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3866           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3867           2 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3868           1 :         xmm_res1 = _mm_or_si128(xmm_res1, _mm_and_si128(xmm1, xmm_mask1));
    3869             : 
                               // Output 3 <- xmm0 byte 9; output 9 <- xmm1 byte 11.
    3870           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3871           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3872           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3873           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3874           2 :         xmm_res1 = _mm_or_si128(
    3875             :             xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 2), xmm_mask1));
    3876             : 
                               // Output 4 <- xmm0 byte 12; output 10 <- xmm1 byte 14. The xmm1
                               // contributions are then merged into the result.
    3877           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3878           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3879           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3880           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3881           3 :         xmm_res1 = _mm_or_si128(
    3882             :             xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 4), xmm_mask1));
    3883           1 :         xmm = _mm_or_si128(xmm, xmm_res1);
    3884             : 
                               // Output 5 <- xmm0 byte 15.
    3885           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3886           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3887           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3888             : 
                               // Outputs 11..15 <- xmm2 bytes 1, 4, 7, 10, 13: the left shift
                               // shrinks by 2 each time while the mask moves up by one byte.
    3889           2 :         xmm = _mm_or_si128(xmm,
    3890             :                            _mm_and_si128(_mm_slli_si128(xmm2, 10), xmm_mask2));
    3891             : 
    3892           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3893           2 :         xmm = _mm_or_si128(xmm,
    3894             :                            _mm_and_si128(_mm_slli_si128(xmm2, 8), xmm_mask2));
    3895             : 
    3896           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3897           2 :         xmm = _mm_or_si128(xmm,
    3898             :                            _mm_and_si128(_mm_slli_si128(xmm2, 6), xmm_mask2));
    3899             : 
    3900           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3901           2 :         xmm = _mm_or_si128(xmm,
    3902             :                            _mm_and_si128(_mm_slli_si128(xmm2, 4), xmm_mask2));
    3903             : 
    3904           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3905           2 :         xmm = _mm_or_si128(xmm,
    3906             :                            _mm_and_si128(_mm_slli_si128(xmm2, 2), xmm_mask2));
    3907             : 
                               // Store the 16 gathered bytes and step over the 48 source bytes.
    3908           1 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm);
    3909             : 
    3910           1 :         pSrc += 3 * 16;
    3911             :     }
                           // Scalar tail for the items the vector loop left over.
    3912           2 :     for (; i < nIters; i++)
    3913             :     {
    3914           1 :         pDest[i] = *pSrc;
    3915           1 :         pSrc += 3;
    3916             :     }
    3917           1 : }
    3918             : 
    3919             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    3920             : 
    3921             : template <>
    3922      192265 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3923             :                                    const GByte *CPL_RESTRICT pSrc,
    3924             :                                    GPtrDiff_t nIters)
    3925             : {
                           // Specialization used when the SSSE3 code path was compiled in
                           // (HAVE_SSSE3_AT_COMPILE_TIME): writes every third source byte to
                           // consecutive destination bytes, nIters times.
    3926      192265 :     if (nIters > 16)
    3927             :     {
                               // Pick the implementation according to the runtime check:
                               // SSSE3 when CPLHaveRuntimeSSSE3() reports it, SSE2 otherwise.
    3928      186142 :         if (CPLHaveRuntimeSSSE3())
    3929             :         {
    3930      186141 :             GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    3931             :         }
    3932             :         else
    3933             :         {
    3934           1 :             GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
    3935             :         }
    3936             :     }
    3937             :     else
    3938             :     {
                               // Short runs (16 items or fewer): plain scalar copy.
    3939       20384 :         for (GPtrDiff_t i = 0; i < nIters; i++)
    3940             :         {
    3941       14261 :             pDest[i] = *pSrc;
    3942       14261 :             pSrc += 3;
    3943             :         }
    3944             :     }
    3945      192265 : }
    3946             : 
    3947             : #else
    3948             : 
    3949             : template <>
    3950             : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3951             :                                    const GByte *CPL_RESTRICT pSrc,
    3952             :                                    GPtrDiff_t nIters)
    3953             : {
                           // Specialization used when no SSSE3 code path was compiled in: always
                           // delegates to the SSE2 implementation, whatever the value of nIters
                           // (that function has its own scalar loop for leftover items).
    3954             :     GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
    3955             : }
    3956             : #endif
    3957             : 
    3958             : template <>
    3959      332655 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
    3960             :                                    const GByte *CPL_RESTRICT pSrc,
    3961             :                                    GPtrDiff_t nIters)
    3962             : {
                           // SSE2 specialization: writes every fourth source byte (offsets 0, 4,
                           // 8, ...) to consecutive destination bytes, nIters times.
                           // i counts destination bytes already produced; pSrc is advanced in
                           // step with it.
    3963      332655 :     decltype(nIters) i = 0;
    3964      332655 :     if (nIters > 16)
    3965             :     {
                               // 0x000000ff in every 32-bit lane: keeps the low byte of each
                               // lane, which on little-endian x86 is the byte at offset 4 * k.
    3966      327362 :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    3967             :         // If we were sure that there would always be 3 trailing bytes, we could
    3968             :         // check against nIters - 15
                               // Each pass reads 64 source bytes to produce 16 outputs. Looping
                               // only while more than 16 items remain means at least 65 source
                               // bytes are still due to be consumed, so the 64-byte read stays
                               // inside them. The last items go through the scalar loop below.
    3969    28035300 :         for (; i < nIters - 16; i += 16)
    3970             :         {
    3971             :             __m128i xmm0 =
    3972    27707900 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3973             :             __m128i xmm1 =
    3974    27707900 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3975             :             __m128i xmm2 =
    3976    27707900 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    3977             :             __m128i xmm3 =
    3978    55415800 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
    3979             :             // Set higher 24bit of each int32 packed word to 0
    3980    27707900 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3981    27707900 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3982    27707900 :             xmm2 = _mm_and_si128(xmm2, xmm_mask);
    3983    27707900 :             xmm3 = _mm_and_si128(xmm3, xmm_mask);
    3984             :             // Pack int32 to int16
                                   // (after masking every lane is in 0..255, so neither
                                   // saturating pack below alters the values)
    3985    27707900 :             xmm0 = _mm_packs_epi32(xmm0, xmm1);
    3986    27707900 :             xmm2 = _mm_packs_epi32(xmm2, xmm3);
    3987             :             // Pack int16 to uint8
    3988    27707900 :             xmm0 = _mm_packus_epi16(xmm0, xmm2);
    3989             : 
    3990             :             // Store result
    3991    27707900 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3992             : 
    3993    27707900 :             pSrc += 4 * 16;
    3994             :         }
    3995             :     }
                           // Scalar tail; it is also the whole copy when nIters <= 16.
    3996     5048700 :     for (; i < nIters; i++)
    3997             :     {
    3998     4716050 :         pDest[i] = *pSrc;
    3999     4716050 :         pSrc += 4;
    4000             :     }
    4001      332655 : }
    4002             : #endif  // HAVE_SSE2
    4003             : 
    4004             : /************************************************************************/
    4005             : /*                            GDALFastCopy()                            */
    4006             : /************************************************************************/
    4007             : 
    4008             : template <class T>
    4009    40101000 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
    4010             :                                 const T *CPL_RESTRICT pSrc, int nSrcStride,
    4011             :                                 GPtrDiff_t nIters)
    4012             : {
    4013    40101000 :     constexpr int sizeofT = static_cast<int>(sizeof(T));
    4014    40101000 :     if (nIters == 1)
    4015             :     {
    4016    22540480 :         *pDest = *pSrc;
    4017             :     }
    4018    17560432 :     else if (nDestStride == sizeofT)
    4019             :     {
    4020    14486789 :         if (nSrcStride == sizeofT)
    4021             :         {
    4022    13397974 :             memcpy(pDest, pSrc, nIters * sizeof(T));
    4023             :         }
    4024     1088847 :         else if (nSrcStride == 2 * sizeofT)
    4025             :         {
    4026      357409 :             GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
    4027             :         }
    4028      731438 :         else if (nSrcStride == 3 * sizeofT)
    4029             :         {
    4030      289245 :             GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
    4031             :         }
    4032      442193 :         else if (nSrcStride == 4 * sizeofT)
    4033             :         {
    4034      336637 :             GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
    4035             :         }
    4036             :         else
    4037             :         {
    4038    17229290 :             while (nIters-- > 0)
    4039             :             {
    4040    17123750 :                 *pDest = *pSrc;
    4041    17123750 :                 pSrc += nSrcStride / sizeofT;
    4042    17123750 :                 pDest++;
    4043             :             }
    4044             :         }
    4045             :     }
    4046     3073663 :     else if (nSrcStride == sizeofT)
    4047             :     {
    4048     3060667 :         if (nDestStride == 2 * sizeofT)
    4049             :         {
    4050      151252 :             GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
    4051             :         }
    4052     2909415 :         else if (nDestStride == 3 * sizeofT)
    4053             :         {
    4054     2131471 :             GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
    4055             :         }
    4056      777937 :         else if (nDestStride == 4 * sizeofT)
    4057             :         {
    4058      613625 :             GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
    4059             :         }
    4060             :         else
    4061             :         {
    4062    17169660 :             while (nIters-- > 0)
    4063             :             {
    4064    17005410 :                 *pDest = *pSrc;
    4065    17005410 :                 pSrc++;
    4066    17005410 :                 pDest += nDestStride / sizeofT;
    4067             :             }
    4068             :         }
    4069             :     }
    4070             :     else
    4071             :     {
    4072     1220108 :         while (nIters-- > 0)
    4073             :         {
    4074     1207102 :             *pDest = *pSrc;
    4075     1207102 :             pSrc += nSrcStride / sizeofT;
    4076     1207102 :             pDest += nDestStride / sizeofT;
    4077             :         }
    4078             :     }
    4079    40101000 : }
    4080             : 
    4081             : /************************************************************************/
    4082             : /*                          GDALFastCopyByte()                          */
    4083             : /************************************************************************/
    4084             : 
    4085      326320 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
    4086             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
    4087             :                              int nDstPixelStride, GPtrDiff_t nWordCount)
    4088             : {
    4089      326320 :     GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
    4090             :                  nWordCount);
    4091      326320 : }
    4092             : 
    4093             : /************************************************************************/
    4094             : /*                           GDALCopyWords()                            */
    4095             : /************************************************************************/
    4096             : 
    4097             : /**
    4098             :  * Copy pixel words from buffer to buffer.
    4099             :  *
    4100             :  * @see GDALCopyWords64()
    4101             :  */
    4102    80491000 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
    4103             :                                GDALDataType eSrcType, int nSrcPixelStride,
    4104             :                                void *CPL_RESTRICT pDstData,
    4105             :                                GDALDataType eDstType, int nDstPixelStride,
    4106             :                                int nWordCount)
    4107             : {
    4108    80491000 :     GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
    4109             :                     nDstPixelStride, nWordCount);
    4110    80491000 : }
    4111             : 
    4112             : /************************************************************************/
    4113             : /*                          GDALCopyWords64()                           */
    4114             : /************************************************************************/
    4115             : 
    4116             : /**
    4117             :  * Copy pixel words from buffer to buffer.
    4118             :  *
    4119             :  * This function is used to copy pixel word values from one memory buffer
    4120             :  * to another, with support for conversion between data types, and differing
    4121             :  * step factors. The data type conversion is done using the following
    4122             :  * rules:
    4123             :  * <ul>
    4124             :  * <li>Values assigned to a lower range integer type are clipped. For
    4125             :  * instance assigning GDT_Int16 values to a GDT_UInt8 buffer will cause values
    4126             :  * less than 0 to be set to 0, and values larger than 255 to be set to 255.
    4127             :  * </li>
    4128             :  * <li>
    4129             :  * Assignment from floating point to integer rounds to closest integer.
    4130             :  * +Infinity is mapped to the largest integer. -Infinity is mapped to the
    4131             :  * smallest integer. NaN is mapped to 0.
    4132             :  * </li>
    4133             :  * <li>
    4134             :  * Assignment from non-complex to complex will result in the imaginary part
    4135             :  * being set to zero on output.
    4136             :  * </li>
    4137             :  * <li> Assignment from complex to
    4138             :  * non-complex will result in the complex portion being lost and the real
    4139             :  * component being preserved (<i>not magnitude!</i>).
    4140             :  * </li>
    4141             :  * </ul>
    4142             :  *
    4143             :  * No assumptions are made about the source or destination words occurring
    4144             :  * on word boundaries.  It is assumed that all values are in native machine
    4145             :  * byte order.
    4146             :  *
    4147             :  * @param pSrcData Pointer to source data to be converted.
    4148             :  * @param eSrcType the source data type (see GDALDataType enum)
    4149             :  * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
    4150             :  * in bytes
    4151             :  * @param pDstData Pointer to buffer where destination data should go
    4152             :  * @param eDstType the destination data type (see GDALDataType enum)
    4153             :  * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
    4154             :  * words), in bytes
    4155             :  * @param nWordCount number of words to be copied
    4156             :  *
    4157             :  * @note
    4158             :  * When adding a new data type to GDAL, you must do the following to
    4159             :  * support it properly within the GDALCopyWords function:
    4160             :  * 1. Add the data type to the switch on eSrcType in GDALCopyWords64.
    4161             :  *    This should invoke the appropriate GDALCopyWordsFromT wrapper.
    4162             :  * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
    4163             :  *    This should call the appropriate GDALCopyWordsT template.
    4164             :  * 3. If appropriate, overload the appropriate CopyWord template in the
    4165             :  *    above namespace. This will ensure that any conversion issues are
    4166             :  *    handled (cases like the float -> int32 case, where the min/max
    4167             :  *    values are subject to roundoff error).
    4168             :  */
    4169             : 
    4170   116774000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
    4171             :                                  GDALDataType eSrcType, int nSrcPixelStride,
    4172             :                                  void *CPL_RESTRICT pDstData,
    4173             :                                  GDALDataType eDstType, int nDstPixelStride,
    4174             :                                  GPtrDiff_t nWordCount)
    4175             : 
    4176             : {
    4177             :     // On platforms where alignment matters, be careful
    4178   116774000 :     const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
    4179   116774000 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
    4180   116774000 :     if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
    4181             :     {
    4182           2 :         CPLError(CE_Failure, CPLE_NotSupported,
    4183             :                  "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
    4184             :                  "argument");
    4185           2 :         return;
    4186             :     }
    4187   116774000 :     if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
    4188    66322800 :         ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
    4189    66322800 :          (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
    4190    66322400 :          (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
    4191    66322300 :          (nDstPixelStride % nDstDataTypeSize) != 0))
    4192             :     {
    4193         905 :         if (eSrcType == eDstType)
    4194             :         {
    4195       34800 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    4196             :             {
    4197       34000 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    4198             :                        static_cast<const GByte *>(pSrcData) +
    4199       34000 :                            nSrcPixelStride * i,
    4200             :                        nDstDataTypeSize);
    4201             :             }
    4202             :         }
    4203             :         else
    4204             :         {
    4205         210 :             const auto getAlignedPtr = [](GByte *ptr, int align)
    4206             :             {
    4207             :                 return ptr +
    4208         210 :                        ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
    4209         210 :                         align);
    4210             :             };
    4211             : 
    4212             :             // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
    4213             :             // be sure to get correctly aligned pointer.
    4214         105 :             constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
    4215             :             GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
    4216             :             GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
    4217             :             GByte *pabySrcBuffer =
    4218         105 :                 getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
    4219             :             GByte *pabyDstBuffer =
    4220         105 :                 getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
    4221        3360 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    4222             :             {
    4223        3255 :                 memcpy(pabySrcBuffer,
    4224             :                        static_cast<const GByte *>(pSrcData) +
    4225        3255 :                            nSrcPixelStride * i,
    4226             :                        nSrcDataTypeSize);
    4227        3255 :                 GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
    4228             :                                 eDstType, 0, 1);
    4229        3255 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    4230             :                        pabyDstBuffer, nDstDataTypeSize);
    4231             :             }
    4232             :         }
    4233         905 :         return;
    4234             :     }
    4235             : 
    4236             :     // Deal with the case where we're replicating a single word into the
    4237             :     // provided buffer
    4238   116773000 :     if (nSrcPixelStride == 0 && nWordCount > 1)
    4239             :     {
    4240     1068100 :         GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
    4241             :                           nDstPixelStride, nWordCount);
    4242     1068100 :         return;
    4243             :     }
    4244             : 
    4245   115705000 :     if (eSrcType == eDstType)
    4246             :     {
    4247    54673700 :         if (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8)
    4248             :         {
    4249    17979000 :             GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
    4250             :                          static_cast<const GByte *>(pSrcData), nSrcPixelStride,
    4251             :                          nWordCount);
    4252    17979000 :             return;
    4253             :         }
    4254             : 
    4255    36694700 :         if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
    4256    21795700 :             (nDstPixelStride % 2) == 0)
    4257             :         {
    4258    21795700 :             GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
    4259             :                          static_cast<const short *>(pSrcData), nSrcPixelStride,
    4260             :                          nWordCount);
    4261    21795700 :             return;
    4262             :         }
    4263             : 
    4264    14899000 :         if (nWordCount == 1)
    4265             :         {
    4266             : #if defined(CSA_BUILD) || defined(__COVERITY__)
    4267             :             // Avoid false positives...
    4268             :             memcpy(pDstData, pSrcData, nSrcDataTypeSize);
    4269             : #else
    4270    14411900 :             if (nSrcDataTypeSize == 2)
    4271           0 :                 memcpy(pDstData, pSrcData, 2);
    4272    14411900 :             else if (nSrcDataTypeSize == 4)
    4273    13807600 :                 memcpy(pDstData, pSrcData, 4);
    4274      604283 :             else if (nSrcDataTypeSize == 8)
    4275      587678 :                 memcpy(pDstData, pSrcData, 8);
    4276             :             else /* if( eSrcType == GDT_CFloat64 ) */
    4277       16605 :                 memcpy(pDstData, pSrcData, 16);
    4278             : #endif
    4279    14411900 :             return;
    4280             :         }
    4281             : 
    4282             :         // Let memcpy() handle the case where we're copying a packed buffer
    4283             :         // of pixels.
    4284      487145 :         if (nSrcPixelStride == nDstPixelStride)
    4285             :         {
    4286      225301 :             if (nSrcPixelStride == nSrcDataTypeSize)
    4287             :             {
    4288      225233 :                 memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
    4289      225233 :                 return;
    4290             :             }
    4291             :         }
    4292             :     }
    4293             : 
    4294             :     // Handle the more general case -- deals with conversion of data types
    4295             :     // directly.
    4296    61292900 :     switch (eSrcType)
    4297             :     {
    4298    20306300 :         case GDT_UInt8:
    4299    20306300 :             GDALCopyWordsFromT<unsigned char>(
    4300             :                 static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
    4301             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4302    20306300 :             break;
    4303        1786 :         case GDT_Int8:
    4304        1786 :             GDALCopyWordsFromT<signed char>(
    4305             :                 static_cast<const signed char *>(pSrcData), nSrcPixelStride,
    4306             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4307        1786 :             break;
    4308       55311 :         case GDT_UInt16:
    4309       55311 :             GDALCopyWordsFromT<unsigned short>(
    4310             :                 static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
    4311             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4312       55311 :             break;
    4313     6519830 :         case GDT_Int16:
    4314     6519830 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    4315             :                                       nSrcPixelStride, false, pDstData,
    4316             :                                       eDstType, nDstPixelStride, nWordCount);
    4317     6519830 :             break;
    4318        8016 :         case GDT_UInt32:
    4319        8016 :             GDALCopyWordsFromT<unsigned int>(
    4320             :                 static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
    4321             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4322        8016 :             break;
    4323    12254800 :         case GDT_Int32:
    4324    12254800 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    4325             :                                     nSrcPixelStride, false, pDstData, eDstType,
    4326             :                                     nDstPixelStride, nWordCount);
    4327    12254800 :             break;
    4328        2205 :         case GDT_UInt64:
    4329        2205 :             GDALCopyWordsFromT<std::uint64_t>(
    4330             :                 static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
    4331             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4332        2205 :             break;
    4333       11729 :         case GDT_Int64:
    4334       11729 :             GDALCopyWordsFromT<std::int64_t>(
    4335             :                 static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
    4336             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4337       11729 :             break;
    4338        1387 :         case GDT_Float16:
    4339        1387 :             GDALCopyWordsFromT<GFloat16>(
    4340             :                 static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
    4341             :                 pDstData, eDstType, nDstPixelStride, nWordCount);
    4342        1387 :             break;
    4343      654936 :         case GDT_Float32:
    4344      654936 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    4345             :                                       nSrcPixelStride, false, pDstData,
    4346             :                                       eDstType, nDstPixelStride, nWordCount);
    4347      654936 :             break;
    4348    20715800 :         case GDT_Float64:
    4349    20715800 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    4350             :                                        nSrcPixelStride, false, pDstData,
    4351             :                                        eDstType, nDstPixelStride, nWordCount);
    4352    20715800 :             break;
    4353      478486 :         case GDT_CInt16:
    4354      478486 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    4355             :                                       nSrcPixelStride, true, pDstData, eDstType,
    4356             :                                       nDstPixelStride, nWordCount);
    4357      478486 :             break;
    4358         868 :         case GDT_CInt32:
    4359         868 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    4360             :                                     nSrcPixelStride, true, pDstData, eDstType,
    4361             :                                     nDstPixelStride, nWordCount);
    4362         868 :             break;
    4363         508 :         case GDT_CFloat16:
    4364         508 :             GDALCopyWordsFromT<GFloat16>(
    4365             :                 static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
    4366             :                 pDstData, eDstType, nDstPixelStride, nWordCount);
    4367         508 :             break;
    4368        2437 :         case GDT_CFloat32:
    4369        2437 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    4370             :                                       nSrcPixelStride, true, pDstData, eDstType,
    4371             :                                       nDstPixelStride, nWordCount);
    4372        2437 :             break;
    4373      278520 :         case GDT_CFloat64:
    4374      278520 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    4375             :                                        nSrcPixelStride, true, pDstData,
    4376             :                                        eDstType, nDstPixelStride, nWordCount);
    4377      278520 :             break;
    4378           0 :         case GDT_Unknown:
    4379             :         case GDT_TypeCount:
    4380           0 :             CPLAssert(false);
    4381             :     }
    4382             : }
    4383             : 
    4384             : /************************************************************************/
    4385             : /*                            GDALCopyBits()                            */
    4386             : /************************************************************************/
    4387             : 
    4388             : /**
    4389             :  * Bitwise word copying.
    4390             :  *
    4391             :  * A function for moving sets of partial bytes around.  Loosely
    4392             :  * speaking this is a bitwise analog to GDALCopyWords().
    4393             :  *
    4394             :  * It copies nStepCount "words" where each word is nBitCount bits long.
    4395             :  * The nSrcStep and nDstStep are the number of bits from the start of one
    4396             :  * word to the next (same as nBitCount if they are packed).  The nSrcOffset
    4397             :  * and nDstOffset are the offset into the source and destination buffers
    4398             :  * to start at, also measured in bits.
    4399             :  *
    4400             :  * All bit offsets are assumed to start from the high order bit in a byte
    4401             :  * (i.e. most significant bit first).  Currently this function is not very
    4402             :  * optimized, but it may be improved for some common cases in the future
    4403             :  * as needed.
    4404             :  *
    4405             :  * @param pabySrcData the source data buffer.
    4406             :  * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
    4407             :  * first word to copy.
    4408             :  * @param nSrcStep the offset in bits from the start of one source word to the
    4409             :  * start of the next.
    4410             :  * @param pabyDstData the destination data buffer.
    4411             :  * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
    4412             :  * first word to copy over.
    4413             :  * @param nDstStep the offset in bits from the start of one word to the
    4414             :  * start of the next.
    4415             :  * @param nBitCount the number of bits in a word to be copied.
    4416             :  * @param nStepCount the number of words to copy.
    4417             :  */
    4418             : 
    4419           0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
    4420             :                   GByte *pabyDstData, int nDstOffset, int nDstStep,
    4421             :                   int nBitCount, int nStepCount)
    4422             : 
    4423             : {
    4424           0 :     VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
    4425             : 
    4426           0 :     for (int iStep = 0; iStep < nStepCount; iStep++)
    4427             :     {
    4428           0 :         for (int iBit = 0; iBit < nBitCount; iBit++)
    4429             :         {
    4430           0 :             if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
    4431           0 :                 pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
    4432             :             else
    4433           0 :                 pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
    4434             : 
    4435           0 :             nSrcOffset++;
    4436           0 :             nDstOffset++;
    4437             :         }
    4438             : 
    4439           0 :         nSrcOffset += (nSrcStep - nBitCount);
    4440           0 :         nDstOffset += (nDstStep - nBitCount);
    4441             :     }
    4442             : }
    4443             : 
    4444             : /************************************************************************/
    4445             : /*                     GDALBandGetBestOverviewLevel()                     */
    4446             : /*                                                                      */
    4447             : /* Returns the best overview level to satisfy the query or -1 if none   */
    4448             : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when        */
    4449             : /* returning a valid overview level                                     */
    4450             : /************************************************************************/
    4451             : 
    4452           0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
    4453             :                                  int &nXSize, int &nYSize, int nBufXSize,
    4454             :                                  int nBufYSize)
    4455             : {
    4456           0 :     return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
    4457           0 :                                          nBufXSize, nBufYSize, nullptr);
    4458             : }
    4459             : 
    4460      524017 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
    4461             :                                   int &nYOff, int &nXSize, int &nYSize,
    4462             :                                   int nBufXSize, int nBufYSize,
    4463             :                                   GDALRasterIOExtraArg *psExtraArg)
    4464             : {
    4465      524017 :     if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
    4466      524017 :         psExtraArg->bUseOnlyThisScale)
    4467         109 :         return -1;
    4468             :     /* -------------------------------------------------------------------- */
    4469             :     /*      Compute the desired downsampling factor.  It is                 */
    4470             :     /*      based on the least reduced axis, and represents the number      */
    4471             :     /*      of source pixels to one destination pixel.                      */
    4472             :     /* -------------------------------------------------------------------- */
    4473      523908 :     const double dfDesiredDownsamplingFactor =
    4474      523908 :         ((nXSize / static_cast<double>(nBufXSize)) <
    4475      361568 :              (nYSize / static_cast<double>(nBufYSize)) ||
    4476             :          nBufYSize == 1)
    4477      752297 :             ? nXSize / static_cast<double>(nBufXSize)
    4478      133179 :             : nYSize / static_cast<double>(nBufYSize);
    4479             : 
    4480             :     /* -------------------------------------------------------------------- */
    4481             :     /*      Find the overview level that largest downsampling factor (most  */
    4482             :     /*      downsampled) that is still less than (or only a little more)    */
    4483             :     /*      downsampled than the request.                                   */
    4484             :     /* -------------------------------------------------------------------- */
    4485      523908 :     const int nOverviewCount = poBand->GetOverviewCount();
    4486      523908 :     GDALRasterBand *poBestOverview = nullptr;
    4487      523908 :     double dfBestDownsamplingFactor = 0;
    4488      523908 :     int nBestOverviewLevel = -1;
    4489             : 
    4490             :     const char *pszOversampligThreshold =
    4491      523908 :         CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
    4492             : 
    4493             :     // Note: keep this logic for overview selection in sync between
    4494             :     // gdalwarp_lib.cpp and rasterio.cpp
    4495             :     // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
    4496             :     const double dfOversamplingThreshold =
    4497     1047810 :         pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
    4498      523899 :         : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
    4499     1047800 :             ? 1.0
    4500      523908 :             : 1.2;
    4501      526604 :     for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    4502             :     {
    4503        5616 :         GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
    4504       11232 :         if (poOverview == nullptr ||
    4505       11231 :             poOverview->GetXSize() > poBand->GetXSize() ||
    4506        5615 :             poOverview->GetYSize() > poBand->GetYSize())
    4507             :         {
    4508           1 :             continue;
    4509             :         }
    4510             : 
    4511             :         // Compute downsampling factor of this overview
    4512             :         const double dfDownsamplingFactor = std::min(
    4513        5615 :             poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
    4514       11230 :             poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
    4515             : 
    4516             :         // Is it nearly the requested factor and better (lower) than
    4517             :         // the current best factor?
    4518             :         // Use an epsilon because of numerical instability.
    4519        5615 :         constexpr double EPSILON = 1e-1;
    4520        5723 :         if (dfDownsamplingFactor >=
    4521        5615 :                 dfDesiredDownsamplingFactor * dfOversamplingThreshold +
    4522        5507 :                     EPSILON ||
    4523             :             dfDownsamplingFactor <= dfBestDownsamplingFactor)
    4524             :         {
    4525         108 :             continue;
    4526             :         }
    4527             : 
    4528             :         // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
    4529        5507 :         const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
    4530             : 
    4531        5507 :         if (pszResampling != nullptr &&
    4532          71 :             STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
    4533          16 :             continue;
    4534             : 
    4535             :         // OK, this is our new best overview.
    4536        5491 :         poBestOverview = poOverview;
    4537        5491 :         nBestOverviewLevel = iOverview;
    4538        5491 :         dfBestDownsamplingFactor = dfDownsamplingFactor;
    4539             : 
    4540        5491 :         if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
    4541             :             EPSILON)
    4542             :         {
    4543        2920 :             break;
    4544             :         }
    4545             :     }
    4546             : 
    4547             :     /* -------------------------------------------------------------------- */
    4548             :     /*      If we didn't find an overview that helps us, just return        */
    4549             :     /*      indicating failure and the full resolution image will be used.  */
    4550             :     /* -------------------------------------------------------------------- */
    4551      523908 :     if (nBestOverviewLevel < 0)
    4552      520915 :         return -1;
    4553             : 
    4554             :     /* -------------------------------------------------------------------- */
    4555             :     /*      Recompute the source window in terms of the selected            */
    4556             :     /*      overview.                                                       */
    4557             :     /* -------------------------------------------------------------------- */
    4558             :     const double dfXFactor =
    4559        2993 :         poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
    4560             :     const double dfYFactor =
    4561        2993 :         poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
    4562        2993 :     CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
    4563             :              poBestOverview->GetYSize());
    4564             : 
    4565        8979 :     const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
    4566        2993 :                                 static_cast<int>(nXOff / dfXFactor + 0.5));
    4567        8979 :     const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
    4568        2993 :                                 static_cast<int>(nYOff / dfYFactor + 0.5));
    4569        2993 :     int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
    4570        2993 :     int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
    4571        2993 :     if (nOXOff + nOXSize > poBestOverview->GetXSize())
    4572           0 :         nOXSize = poBestOverview->GetXSize() - nOXOff;
    4573        2993 :     if (nOYOff + nOYSize > poBestOverview->GetYSize())
    4574           2 :         nOYSize = poBestOverview->GetYSize() - nOYOff;
    4575             : 
    4576        2993 :     if (psExtraArg)
    4577             :     {
    4578        2993 :         if (psExtraArg->bFloatingPointWindowValidity)
    4579             :         {
    4580         117 :             psExtraArg->dfXOff /= dfXFactor;
    4581         117 :             psExtraArg->dfXSize /= dfXFactor;
    4582         117 :             psExtraArg->dfYOff /= dfYFactor;
    4583         117 :             psExtraArg->dfYSize /= dfYFactor;
    4584             :         }
    4585        2876 :         else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
    4586             :         {
    4587          16 :             psExtraArg->bFloatingPointWindowValidity = true;
    4588          16 :             psExtraArg->dfXOff = nXOff / dfXFactor;
    4589          16 :             psExtraArg->dfXSize = nXSize / dfXFactor;
    4590          16 :             psExtraArg->dfYOff = nYOff / dfYFactor;
    4591          16 :             psExtraArg->dfYSize = nYSize / dfYFactor;
    4592             :         }
    4593             :     }
    4594             : 
    4595        2993 :     nXOff = nOXOff;
    4596        2993 :     nYOff = nOYOff;
    4597        2993 :     nXSize = nOXSize;
    4598        2993 :     nYSize = nOYSize;
    4599             : 
    4600        2993 :     return nBestOverviewLevel;
    4601             : }
    4602             : 
    4603             : /************************************************************************/
    4604             : /*                          OverviewRasterIO()                          */
    4605             : /*                                                                      */
    4606             : /*      Special work function to utilize available overviews to         */
    4607             : /*      more efficiently satisfy downsampled requests.  It will         */
    4608             : /*      return CE_Failure if there are no appropriate overviews         */
    4609             : /*      available but it doesn't emit any error messages.               */
    4610             : /************************************************************************/
    4611             : 
    4612             : //! @cond Doxygen_Suppress
    4613           2 : CPLErr GDALRasterBand::OverviewRasterIO(
    4614             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4615             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4616             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    4617             : 
    4618             : {
    4619             :     GDALRasterIOExtraArg sExtraArg;
    4620           2 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4621             : 
    4622           2 :     const int nOverview = GDALBandGetBestOverviewLevel2(
    4623             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
    4624           2 :     if (nOverview < 0)
    4625           1 :         return CE_Failure;
    4626             : 
    4627             :     /* -------------------------------------------------------------------- */
    4628             :     /*      Recast the call in terms of the new raster layer.               */
    4629             :     /* -------------------------------------------------------------------- */
    4630           1 :     GDALRasterBand *poOverviewBand = GetOverview(nOverview);
    4631           1 :     if (poOverviewBand == nullptr)
    4632           0 :         return CE_Failure;
    4633             : 
    4634           1 :     return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4635             :                                     pData, nBufXSize, nBufYSize, eBufType,
    4636           1 :                                     nPixelSpace, nLineSpace, &sExtraArg);
    4637             : }
    4638             : 
    4639             : /************************************************************************/
    4640             : /*                        TryOverviewRasterIO()                         */
    4641             : /************************************************************************/
    4642             : 
    4643      362428 : CPLErr GDALRasterBand::TryOverviewRasterIO(
    4644             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4645             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4646             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
    4647             :     int *pbTried)
    4648             : {
    4649      362428 :     int nXOffMod = nXOff;
    4650      362428 :     int nYOffMod = nYOff;
    4651      362428 :     int nXSizeMod = nXSize;
    4652      362428 :     int nYSizeMod = nYSize;
    4653             :     GDALRasterIOExtraArg sExtraArg;
    4654             : 
    4655      362428 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4656             : 
    4657      362428 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    4658             :         this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
    4659             :         &sExtraArg);
    4660             : 
    4661      362428 :     if (iOvrLevel >= 0)
    4662             :     {
    4663          53 :         GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
    4664          53 :         if (poOverviewBand)
    4665             :         {
    4666          53 :             *pbTried = TRUE;
    4667          53 :             return poOverviewBand->RasterIO(
    4668             :                 eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
    4669             :                 nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    4670          53 :                 &sExtraArg);
    4671             :         }
    4672             :     }
    4673             : 
    4674      362375 :     *pbTried = FALSE;
    4675      362375 :     return CE_None;
    4676             : }
    4677             : 
    4678             : /************************************************************************/
    4679             : /*                        TryOverviewRasterIO()                         */
    4680             : /************************************************************************/
    4681             : 
    4682      158613 : CPLErr GDALDataset::TryOverviewRasterIO(
    4683             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4684             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4685             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    4686             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
    4687             :     int *pbTried)
    4688             : {
    4689      158613 :     int nXOffMod = nXOff;
    4690      158613 :     int nYOffMod = nYOff;
    4691      158613 :     int nXSizeMod = nXSize;
    4692      158613 :     int nYSizeMod = nYSize;
    4693             :     GDALRasterIOExtraArg sExtraArg;
    4694      158613 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4695             : 
    4696      317226 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    4697      158613 :         papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
    4698             :         nBufYSize, &sExtraArg);
    4699             : 
    4700      158655 :     if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
    4701          42 :         papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
    4702             :     {
    4703          42 :         *pbTried = TRUE;
    4704          42 :         return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
    4705             :             eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
    4706             :             nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
    4707          42 :             nLineSpace, nBandSpace, &sExtraArg);
    4708             :     }
    4709             :     else
    4710             :     {
    4711      158571 :         *pbTried = FALSE;
    4712      158571 :         return CE_None;
    4713             :     }
    4714             : }
    4715             : 
    4716             : /************************************************************************/
    4717             : /*                        GetBestOverviewLevel()                        */
    4718             : /*                                                                      */
    4719             : /* Returns the best overview level to satisfy the query or -1 if none   */
    4720             : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid     */
    4721             : /* overview level                                                       */
    4722             : /************************************************************************/
    4723             : 
    4724           4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
    4725             :                                            int &nYOff, int &nXSize, int &nYSize,
    4726             :                                            int nBufXSize, int nBufYSize,
    4727             :                                            int nBandCount,
    4728             :                                            const int *panBandMap,
    4729             :                                            GDALRasterIOExtraArg *psExtraArg)
    4730             : {
    4731           4 :     int nOverviewCount = 0;
    4732           4 :     GDALRasterBand *poFirstBand = nullptr;
    4733             : 
    4734             :     /* -------------------------------------------------------------------- */
    4735             :     /* Check that all bands have the same number of overviews and           */
    4736             :     /* that they have all the same size and block dimensions                */
    4737             :     /* -------------------------------------------------------------------- */
    4738          12 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4739             :     {
    4740           8 :         GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
    4741           8 :         if (poBand == nullptr)
    4742           0 :             return -1;
    4743           8 :         if (iBand == 0)
    4744             :         {
    4745           4 :             poFirstBand = poBand;
    4746           4 :             nOverviewCount = poBand->GetOverviewCount();
    4747             :         }
    4748           4 :         else if (nOverviewCount != poBand->GetOverviewCount())
    4749             :         {
    4750           0 :             CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    4751             :                              "mismatched overview count, use std method.");
    4752           0 :             return -1;
    4753             :         }
    4754             :         else
    4755             :         {
    4756           4 :             for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    4757             :             {
    4758           0 :                 GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
    4759             :                 GDALRasterBand *poOvrFirstBand =
    4760           0 :                     poFirstBand->GetOverview(iOverview);
    4761           0 :                 if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
    4762           0 :                     continue;
    4763             : 
    4764           0 :                 if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
    4765           0 :                     poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
    4766             :                 {
    4767           0 :                     CPLDebug("GDAL",
    4768             :                              "GDALDataset::GetBestOverviewLevel() ... "
    4769             :                              "mismatched overview sizes, use std method.");
    4770           0 :                     return -1;
    4771             :                 }
    4772           0 :                 int nBlockXSizeFirst = 0;
    4773           0 :                 int nBlockYSizeFirst = 0;
    4774           0 :                 poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
    4775             :                                              &nBlockYSizeFirst);
    4776             : 
    4777           0 :                 int nBlockXSizeCurrent = 0;
    4778           0 :                 int nBlockYSizeCurrent = 0;
    4779           0 :                 poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
    4780             :                                         &nBlockYSizeCurrent);
    4781             : 
    4782           0 :                 if (nBlockXSizeFirst != nBlockXSizeCurrent ||
    4783           0 :                     nBlockYSizeFirst != nBlockYSizeCurrent)
    4784             :                 {
    4785           0 :                     CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    4786             :                                      "mismatched block sizes, use std method.");
    4787           0 :                     return -1;
    4788             :                 }
    4789             :             }
    4790             :         }
    4791             :     }
    4792           4 :     if (poFirstBand == nullptr)
    4793           0 :         return -1;
    4794             : 
    4795           4 :     return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
    4796             :                                          nYSize, nBufXSize, nBufYSize,
    4797           4 :                                          psExtraArg);
    4798             : }
    4799             : 
    4800             : /************************************************************************/
    4801             : /*                         BlockBasedRasterIO()                         */
    4802             : /*                                                                      */
    4803             : /*      This convenience function implements a dataset level            */
    4804             : /*      RasterIO() interface based on calling down to fetch blocks,     */
    4805             : /*      much like the GDALRasterBand::IRasterIO(), but it handles       */
    4806             : /*      all bands at once, so that a format driver that handles a       */
    4807             : /*      request for different bands of the same block efficiently       */
    4808             : /*      (i.e. without re-reading interleaved data) will benefit.        */
    4809             : /*                                                                      */
    4810             : /*      This method is intended to be called by an overridden           */
    4811             : /*      IRasterIO() method in the driver specific GDALDataset           */
    4812             : /*      derived class.                                                  */
    4813             : /*                                                                      */
    4814             : /*      Default internal implementation of RasterIO() ... utilizes      */
    4815             : /*      the Block access methods to satisfy the request.  This would    */
    4816             : /*      normally only be overridden by formats with overviews.          */
    4817             : /*                                                                      */
    4818             : /*      To keep things relatively simple, this method does not          */
    4819             : /*      currently take advantage of some special cases addressed in     */
    4820             : /*      GDALRasterBand::IRasterIO(), so it is likely best to only       */
    4821             : /*      call it when you know it will help.  That is in cases where     */
    4822             : /*      data is at 1:1 to the buffer, and you know the driver is        */
    4823             : /*      implementing interleaved IO efficiently on a block by block     */
    4824             : /*      basis. Overviews will be used when possible.                    */
    4825             : /************************************************************************/
    4826             : 
    4827       64982 : CPLErr GDALDataset::BlockBasedRasterIO(
    4828             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4829             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4830             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    4831             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    4832             : 
    4833             : {
    4834       64982 :     CPLAssert(nullptr != pData);
    4835             : 
    4836       64982 :     GByte **papabySrcBlock = nullptr;
    4837       64982 :     GDALRasterBlock *poBlock = nullptr;
    4838       64982 :     GDALRasterBlock **papoBlocks = nullptr;
    4839       64982 :     int nLBlockX = -1;
    4840       64982 :     int nLBlockY = -1;
    4841             :     int iBufYOff;
    4842             :     int iBufXOff;
    4843       64982 :     int nBlockXSize = 1;
    4844       64982 :     int nBlockYSize = 1;
    4845       64982 :     CPLErr eErr = CE_None;
    4846       64982 :     GDALDataType eDataType = GDT_UInt8;
    4847             : 
    4848       64982 :     const bool bUseIntegerRequestCoords =
    4849       65020 :         (!psExtraArg->bFloatingPointWindowValidity ||
    4850          38 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
    4851          36 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
    4852             : 
    4853             :     /* -------------------------------------------------------------------- */
    4854             :     /*      Ensure that all bands share a common block size and data type.  */
    4855             :     /* -------------------------------------------------------------------- */
    4856      308187 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4857             :     {
    4858      243205 :         GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4859             : 
    4860      243205 :         if (iBand == 0)
    4861             :         {
    4862       64982 :             poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4863       64982 :             eDataType = poBand->GetRasterDataType();
    4864             :         }
    4865             :         else
    4866             :         {
    4867      178223 :             int nThisBlockXSize = 0;
    4868      178223 :             int nThisBlockYSize = 0;
    4869      178223 :             poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
    4870      178223 :             if (nThisBlockXSize != nBlockXSize ||
    4871      178223 :                 nThisBlockYSize != nBlockYSize)
    4872             :             {
    4873           0 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4874             :                                  "mismatched block sizes, use std method.");
    4875           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4876             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4877             :                                          nBandCount, panBandMap, nPixelSpace,
    4878           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4879             :             }
    4880             : 
    4881      178223 :             if (eDataType != poBand->GetRasterDataType() &&
    4882           0 :                 (nXSize != nBufXSize || nYSize != nBufYSize))
    4883             :             {
    4884           0 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4885             :                                  "mismatched band data types, use std method.");
    4886           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4887             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4888             :                                          nBandCount, panBandMap, nPixelSpace,
    4889           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4890             :             }
    4891             :         }
    4892             :     }
    4893             : 
    4894             :     /* ==================================================================== */
    4895             :     /*      In this special case at full resolution we step through in      */
    4896             :     /*      blocks, turning the request over to the per-band                */
    4897             :     /*      IRasterIO(), but ensuring that all bands of one block are       */
    4898             :     /*      called before proceeding to the next.                           */
    4899             :     /* ==================================================================== */
    4900             : 
    4901       64982 :     if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
    4902             :     {
    4903             :         GDALRasterIOExtraArg sDummyExtraArg;
    4904       64978 :         INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
    4905             : 
    4906       64978 :         int nChunkYSize = 0;
    4907       64978 :         int nChunkXSize = 0;
    4908             : 
    4909      213434 :         for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
    4910             :         {
    4911      149472 :             const int nChunkYOff = iBufYOff + nYOff;
    4912      149472 :             nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
    4913      149472 :             if (nChunkYOff + nChunkYSize > nYOff + nYSize)
    4914       59977 :                 nChunkYSize = (nYOff + nYSize) - nChunkYOff;
    4915             : 
    4916      822752 :             for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
    4917             :             {
    4918      674295 :                 const int nChunkXOff = iBufXOff + nXOff;
    4919      674295 :                 nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
    4920      674295 :                 if (nChunkXOff + nChunkXSize > nXOff + nXSize)
    4921       70691 :                     nChunkXSize = (nXOff + nXSize) - nChunkXOff;
    4922             : 
    4923      674295 :                 GByte *pabyChunkData =
    4924      674295 :                     static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
    4925      674295 :                     static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
    4926             : 
    4927     3282490 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4928             :                 {
    4929     2609210 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4930             : 
    4931     5218420 :                     eErr = poBand->IRasterIO(
    4932             :                         eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
    4933             :                         nChunkYSize,
    4934     2609210 :                         pabyChunkData +
    4935     2609210 :                             static_cast<GPtrDiff_t>(iBand) * nBandSpace,
    4936             :                         nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
    4937     2609210 :                         nLineSpace, &sDummyExtraArg);
    4938     2609210 :                     if (eErr != CE_None)
    4939        1015 :                         return eErr;
    4940             :                 }
    4941             :             }
    4942             : 
    4943      167371 :             if (psExtraArg->pfnProgress != nullptr &&
    4944       18914 :                 !psExtraArg->pfnProgress(
    4945      167371 :                     1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
    4946             :                         nBufYSize,
    4947             :                     "", psExtraArg->pProgressData))
    4948             :             {
    4949           1 :                 return CE_Failure;
    4950             :             }
    4951             :         }
    4952             : 
    4953       63962 :         return CE_None;
    4954             :     }
    4955             : 
    4956             :     /* Below code is not compatible with that case. It would need a complete */
    4957             :     /* separate code like done in GDALRasterBand::IRasterIO. */
    4958           4 :     if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
    4959             :     {
    4960           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4961             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4962             :                                  panBandMap, nPixelSpace, nLineSpace,
    4963           0 :                                  nBandSpace, psExtraArg);
    4964             :     }
    4965             : 
    4966             :     /* We could have a smarter implementation, but that will do for now */
    4967           4 :     if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    4968           0 :         (nBufXSize != nXSize || nBufYSize != nYSize))
    4969             :     {
    4970           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4971             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4972             :                                  panBandMap, nPixelSpace, nLineSpace,
    4973           0 :                                  nBandSpace, psExtraArg);
    4974             :     }
    4975             : 
    4976             :     /* ==================================================================== */
    4977             :     /*      Loop reading required source blocks to satisfy output           */
    4978             :     /*      request.  This is the most general implementation.              */
    4979             :     /* ==================================================================== */
    4980             : 
    4981           4 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
    4982             : 
    4983             :     papabySrcBlock =
    4984           4 :         static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
    4985             :     papoBlocks =
    4986           4 :         static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
    4987             : 
    4988             :     /* -------------------------------------------------------------------- */
    4989             :     /*      Select an overview level if appropriate.                        */
    4990             :     /* -------------------------------------------------------------------- */
    4991             : 
    4992             :     GDALRasterIOExtraArg sExtraArg;
    4993           4 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4994           4 :     const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
    4995             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
    4996             :         panBandMap, &sExtraArg);
    4997           4 :     if (nOverviewLevel >= 0)
    4998             :     {
    4999           2 :         GetRasterBand(panBandMap[0])
    5000           2 :             ->GetOverview(nOverviewLevel)
    5001           2 :             ->GetBlockSize(&nBlockXSize, &nBlockYSize);
    5002             :     }
    5003             : 
    5004           4 :     double dfXOff = nXOff;
    5005           4 :     double dfYOff = nYOff;
    5006           4 :     double dfXSize = nXSize;
    5007           4 :     double dfYSize = nYSize;
    5008           4 :     if (sExtraArg.bFloatingPointWindowValidity)
    5009             :     {
    5010           2 :         dfXOff = sExtraArg.dfXOff;
    5011           2 :         dfYOff = sExtraArg.dfYOff;
    5012           2 :         dfXSize = sExtraArg.dfXSize;
    5013           2 :         dfYSize = sExtraArg.dfYSize;
    5014             :     }
    5015             : 
    5016             :     /* -------------------------------------------------------------------- */
    5017             :     /*      Compute stepping increment.                                     */
    5018             :     /* -------------------------------------------------------------------- */
    5019           4 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
    5020           4 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
    5021             : 
    5022           4 :     constexpr double EPS = 1e-10;
    5023             :     /* -------------------------------------------------------------------- */
    5024             :     /*      Loop over buffer computing source locations.                    */
    5025             :     /* -------------------------------------------------------------------- */
    5026          36 :     for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
    5027             :     {
    5028             :         GPtrDiff_t iSrcOffset;
    5029             : 
    5030             :         // Add small epsilon to avoid some numeric precision issues.
    5031          32 :         const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
    5032          32 :         const int iSrcY = static_cast<int>(std::min(
    5033          32 :             std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
    5034             : 
    5035          32 :         GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
    5036             :                                 static_cast<GPtrDiff_t>(nLineSpace);
    5037             : 
    5038         302 :         for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
    5039             :         {
    5040         270 :             const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
    5041         270 :             const int iSrcX = static_cast<int>(std::min(
    5042         270 :                 std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
    5043             : 
    5044             :             // FIXME: this code likely doesn't work if the dirty block gets
    5045             :             // flushed to disk before being completely written. In the meantime,
    5046             :             // bJustInitialize should probably be set to FALSE even if it is not
    5047             :             // ideal performance wise, and for lossy compression
    5048             : 
    5049             :             /* --------------------------------------------------------------------
    5050             :              */
    5051             :             /*      Ensure we have the appropriate block loaded. */
    5052             :             /* --------------------------------------------------------------------
    5053             :              */
    5054         270 :             if (iSrcX < nLBlockX * nBlockXSize ||
    5055         270 :                 iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
    5056         266 :                 iSrcY < nLBlockY * nBlockYSize ||
    5057         266 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
    5058             :             {
    5059           4 :                 nLBlockX = iSrcX / nBlockXSize;
    5060           4 :                 nLBlockY = iSrcY / nBlockYSize;
    5061             : 
    5062           4 :                 const bool bJustInitialize =
    5063           0 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
    5064           0 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
    5065           4 :                     nXOff <= nLBlockX * nBlockXSize &&
    5066           0 :                     nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
    5067             :                 /*bool bMemZeroBuffer = FALSE;
    5068             :                 if( eRWFlag == GF_Write && !bJustInitialize &&
    5069             :                     nXOff <= nLBlockX * nBlockXSize &&
    5070             :                     nYOff <= nLBlockY * nBlockYSize &&
    5071             :                     (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
    5072             :                      (nXOff + nXSize == GetRasterXSize() &&
    5073             :                      (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
    5074             :                     (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
    5075             :                      (nYOff + nYSize == GetRasterYSize() &&
    5076             :                      (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
    5077             :                 {
    5078             :                     bJustInitialize = TRUE;
    5079             :                     bMemZeroBuffer = TRUE;
    5080             :                 }*/
    5081          12 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    5082             :                 {
    5083           8 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    5084           8 :                     if (nOverviewLevel >= 0)
    5085           2 :                         poBand = poBand->GetOverview(nOverviewLevel);
    5086          16 :                     poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
    5087           8 :                                                         bJustInitialize);
    5088           8 :                     if (poBlock == nullptr)
    5089             :                     {
    5090           0 :                         eErr = CE_Failure;
    5091           0 :                         goto CleanupAndReturn;
    5092             :                     }
    5093             : 
    5094           8 :                     if (eRWFlag == GF_Write)
    5095           0 :                         poBlock->MarkDirty();
    5096             : 
    5097           8 :                     if (papoBlocks[iBand] != nullptr)
    5098           0 :                         papoBlocks[iBand]->DropLock();
    5099             : 
    5100           8 :                     papoBlocks[iBand] = poBlock;
    5101             : 
    5102           8 :                     papabySrcBlock[iBand] =
    5103           8 :                         static_cast<GByte *>(poBlock->GetDataRef());
    5104             :                     /*if( bMemZeroBuffer )
    5105             :                     {
    5106             :                         memset(papabySrcBlock[iBand], 0,
    5107             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
    5108             :                     * nBlockYSize);
    5109             :                     }*/
    5110             :                 }
    5111             :             }
    5112             : 
    5113             :             /* --------------------------------------------------------------------
    5114             :              */
    5115             :             /*      Copy over this pixel of data. */
    5116             :             /* --------------------------------------------------------------------
    5117             :              */
    5118         270 :             iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
    5119         270 :                           static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
    5120         270 :                           (static_cast<GPtrDiff_t>(iSrcY) -
    5121         270 :                            static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
    5122         270 :                               nBlockXSize) *
    5123         270 :                          nBandDataSize;
    5124             : 
    5125         980 :             for (int iBand = 0; iBand < nBandCount; iBand++)
    5126             :             {
    5127         710 :                 GByte *pabySrcBlock = papabySrcBlock[iBand];
    5128         710 :                 GPtrDiff_t iBandBufOffset =
    5129         710 :                     iBufOffset + static_cast<GPtrDiff_t>(iBand) *
    5130             :                                      static_cast<GPtrDiff_t>(nBandSpace);
    5131             : 
    5132         710 :                 if (eDataType == eBufType)
    5133             :                 {
    5134         710 :                     if (eRWFlag == GF_Read)
    5135         710 :                         memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
    5136         710 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
    5137             :                     else
    5138           0 :                         memcpy(pabySrcBlock + iSrcOffset,
    5139             :                                static_cast<const GByte *>(pData) +
    5140           0 :                                    iBandBufOffset,
    5141             :                                nBandDataSize);
    5142             :                 }
    5143             :                 else
    5144             :                 {
    5145             :                     /* type to type conversion ... ouch, this is expensive way
    5146             :                        of handling single words */
    5147             : 
    5148           0 :                     if (eRWFlag == GF_Read)
    5149           0 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
    5150             :                                         static_cast<GByte *>(pData) +
    5151           0 :                                             iBandBufOffset,
    5152             :                                         eBufType, 0, 1);
    5153             :                     else
    5154           0 :                         GDALCopyWords64(static_cast<const GByte *>(pData) +
    5155           0 :                                             iBandBufOffset,
    5156           0 :                                         eBufType, 0, pabySrcBlock + iSrcOffset,
    5157             :                                         eDataType, 0, 1);
    5158             :                 }
    5159             :             }
    5160             : 
    5161         270 :             iBufOffset += static_cast<int>(nPixelSpace);
    5162             :         }
    5163             :     }
    5164             : 
    5165             :     /* -------------------------------------------------------------------- */
    5166             :     /*      CleanupAndReturn.                                               */
    5167             :     /* -------------------------------------------------------------------- */
    5168           4 : CleanupAndReturn:
    5169           4 :     CPLFree(papabySrcBlock);
    5170           4 :     if (papoBlocks != nullptr)
    5171             :     {
    5172          12 :         for (int iBand = 0; iBand < nBandCount; iBand++)
    5173             :         {
    5174           8 :             if (papoBlocks[iBand] != nullptr)
    5175           8 :                 papoBlocks[iBand]->DropLock();
    5176             :         }
    5177           4 :         CPLFree(papoBlocks);
    5178             :     }
    5179             : 
    5180           4 :     return eErr;
    5181             : }
    5182             : 
    5183             : //! @endcond
    5184             : 
    5185             : /************************************************************************/
    5186             : /*                  GDALCopyWholeRasterGetSwathSize()                   */
    5187             : /************************************************************************/
    5188             : 
    5189        3375 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
    5190             :                                             GDALRasterBand *poDstPrototypeBand,
    5191             :                                             int nBandCount,
    5192             :                                             int bDstIsCompressed,
    5193             :                                             int bInterleave, int *pnSwathCols,
    5194             :                                             int *pnSwathLines)
    5195             : {
    5196        3375 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    5197        3375 :     int nSrcBlockXSize = 0;
    5198        3375 :     int nSrcBlockYSize = 0;
    5199        3375 :     int nBlockXSize = 0;
    5200        3375 :     int nBlockYSize = 0;
    5201             : 
    5202        3375 :     int nXSize = poSrcPrototypeBand->GetXSize();
    5203        3375 :     int nYSize = poSrcPrototypeBand->GetYSize();
    5204             : 
    5205        3375 :     poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
    5206        3375 :     poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    5207             : 
    5208        3375 :     const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
    5209        3375 :     const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
    5210             : 
    5211        3375 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5212        3375 :     if (bInterleave)
    5213         583 :         nPixelSize *= nBandCount;
    5214             : 
    5215             :     // aim for one row of blocks.  Do not settle for less.
    5216        3375 :     int nSwathCols = nXSize;
    5217        3375 :     int nSwathLines = nMaxBlockYSize;
    5218             : 
    5219             :     const char *pszSrcCompression =
    5220        3375 :         poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    5221        3375 :     if (pszSrcCompression == nullptr)
    5222             :     {
    5223        3355 :         auto poSrcDS = poSrcPrototypeBand->GetDataset();
    5224        3355 :         if (poSrcDS)
    5225             :             pszSrcCompression =
    5226        3349 :                 poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    5227             :     }
    5228             : 
    5229             :     /* -------------------------------------------------------------------- */
    5230             :     /*      What will our swath size be?                                    */
    5231             :     /* -------------------------------------------------------------------- */
    5232             :     // When writing interleaved data in a compressed format, we want to be sure
    5233             :     // that each block will only be written once, so the swath size must not be
    5234             :     // greater than the block cache.
    5235        3375 :     const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
    5236             :     int nTargetSwathSize;
    5237        3375 :     if (pszSwathSize != nullptr)
    5238           0 :         nTargetSwathSize = static_cast<int>(
    5239           0 :             std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
    5240             :     else
    5241             :     {
    5242             :         // As a default, take one 1/4 of the cache size.
    5243        3375 :         nTargetSwathSize = static_cast<int>(
    5244        3375 :             std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
    5245             : 
    5246             :         // but if the minimum ideal swath buf size is less, then go for it to
    5247             :         // avoid unnecessarily abusing RAM usage.
    5248             :         // but try to use 10 MB at least.
    5249        3375 :         GIntBig nIdealSwathBufSize =
    5250        3375 :             static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
    5251        3375 :         int nMinTargetSwathSize = 10 * 1000 * 1000;
    5252             : 
    5253        3375 :         if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
    5254        3375 :              GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
    5255             :         {
    5256           1 :             nMinTargetSwathSize = nTargetSwathSize;
    5257             :         }
    5258             : 
    5259        3375 :         if (nIdealSwathBufSize < nTargetSwathSize &&
    5260        3365 :             nIdealSwathBufSize < nMinTargetSwathSize)
    5261             :         {
    5262        3362 :             nIdealSwathBufSize = nMinTargetSwathSize;
    5263             :         }
    5264             : 
    5265        3375 :         if (pszSrcCompression != nullptr &&
    5266         181 :             EQUAL(pszSrcCompression, "JPEG2000") &&
    5267           0 :             (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
    5268           0 :                                    (nSrcBlockYSize % nBlockYSize) == 0)))
    5269             :         {
    5270           2 :             nIdealSwathBufSize =
    5271           4 :                 std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
    5272           2 :                                                  nSrcBlockYSize * nPixelSize);
    5273             :         }
    5274        3375 :         if (nTargetSwathSize > nIdealSwathBufSize)
    5275        3362 :             nTargetSwathSize = static_cast<int>(
    5276        3362 :                 std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
    5277             :     }
    5278             : 
    5279        3375 :     if (nTargetSwathSize < 1000000)
    5280           8 :         nTargetSwathSize = 1000000;
    5281             : 
    5282             :     /* But let's check that the swath size fits in the block cache. */
    5283        3596 :     if (bDstIsCompressed && bInterleave &&
    5284         221 :         nTargetSwathSize > GDALGetCacheMax64())
    5285             :     {
    5286           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    5287             :                  "When translating into a compressed interleave format, "
    5288             :                  "the block cache size (" CPL_FRMT_GIB ") "
    5289             :                  "should be at least the size of the swath (%d) "
    5290             :                  "(GDAL_SWATH_SIZE config. option)",
    5291             :                  GDALGetCacheMax64(), nTargetSwathSize);
    5292             :     }
    5293             : 
    5294             : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
    5295             : #define ROUND_TO(x, y) (((x) / (y)) * (y))
    5296             : 
    5297             :     // if both input and output datasets are tiled, and the tile dimensions
    5298             :     // are "compatible", try to stick to a swath dimension that is a multiple
    5299             :     // of input and output block dimensions.
    5300        3375 :     if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
    5301          47 :         IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
    5302          47 :         IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
    5303          47 :         IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    5304          47 :         IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    5305             :     {
    5306          47 :         if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
    5307          47 :                 nPixelSize <=
    5308          47 :             static_cast<GIntBig>(nTargetSwathSize))
    5309             :         {
    5310          47 :             nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
    5311          47 :             nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
    5312          47 :             if (nSwathCols == 0)
    5313           0 :                 nSwathCols = nMaxBlockXSize;
    5314          47 :             if (nSwathCols > nXSize)
    5315          45 :                 nSwathCols = nXSize;
    5316          47 :             nSwathLines = nMaxBlockYSize;
    5317             : 
    5318          47 :             if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
    5319          47 :                 static_cast<GIntBig>(nTargetSwathSize))
    5320             :             {
    5321           0 :                 nSwathCols = nXSize;
    5322           0 :                 nSwathLines = nBlockYSize;
    5323             :             }
    5324             :         }
    5325             :     }
    5326             : 
    5327        3375 :     const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
    5328        3375 :     const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
    5329        3375 :     if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
    5330             :     {
    5331           1 :         nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
    5332           1 :         if (nSwathLines == 0)
    5333           1 :             nSwathLines = 1;
    5334             : 
    5335           1 :         CPLDebug(
    5336             :             "GDAL",
    5337             :             "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
    5338             :             "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
    5339             :             "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
    5340           1 :             nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
    5341             :     }
    5342             :     // If we are processing single scans, try to handle several at once.
    5343             :     // If we are handling swaths already, only grow the swath if a row
    5344             :     // of blocks is substantially less than our target buffer size.
    5345        3374 :     else if (nSwathLines == 1 ||
    5346        2823 :              nMemoryPerCol * nSwathLines <
    5347        2823 :                  static_cast<GIntBig>(nTargetSwathSize) / 10)
    5348             :     {
    5349        3346 :         nSwathLines = std::min(
    5350             :             nYSize,
    5351        3346 :             std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
    5352             : 
    5353             :         /* If possible try to align to source and target block height */
    5354        3346 :         if ((nSwathLines % nMaxBlockYSize) != 0 &&
    5355         273 :             nSwathLines > nMaxBlockYSize &&
    5356         273 :             IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    5357         244 :             IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    5358         217 :             nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
    5359             :     }
    5360             : 
    5361        3375 :     if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
    5362           0 :         (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
    5363           0 :                                IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
    5364             :     {
    5365             :         // Typical use case: converting from Pleiades that is 2048x2048 tiled.
    5366           2 :         if (nSwathLines < nSrcBlockYSize)
    5367             :         {
    5368           0 :             nSwathLines = nSrcBlockYSize;
    5369             : 
    5370             :             // Number of pixels that can be read/write simultaneously.
    5371           0 :             nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
    5372           0 :             nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
    5373           0 :             if (nSwathCols == 0)
    5374           0 :                 nSwathCols = nSrcBlockXSize;
    5375           0 :             if (nSwathCols > nXSize)
    5376           0 :                 nSwathCols = nXSize;
    5377             : 
    5378           0 :             CPLDebug(
    5379             :                 "GDAL",
    5380             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    5381             :                 "too high block, "
    5382             :                 "use partial width at one time");
    5383             :         }
    5384           2 :         else if ((nSwathLines % nSrcBlockYSize) != 0)
    5385             :         {
    5386             :             /* Round on a multiple of nSrcBlockYSize */
    5387           0 :             nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
    5388           0 :             CPLDebug(
    5389             :                 "GDAL",
    5390             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    5391             :                 "round nSwathLines to block height : %d",
    5392             :                 nSwathLines);
    5393             :         }
    5394             :     }
    5395        3373 :     else if (bDstIsCompressed)
    5396             :     {
    5397         419 :         if (nSwathLines < nBlockYSize)
    5398             :         {
    5399         146 :             nSwathLines = nBlockYSize;
    5400             : 
    5401             :             // Number of pixels that can be read/write simultaneously.
    5402         146 :             nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
    5403         146 :             nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
    5404         146 :             if (nSwathCols == 0)
    5405           0 :                 nSwathCols = nBlockXSize;
    5406         146 :             if (nSwathCols > nXSize)
    5407         146 :                 nSwathCols = nXSize;
    5408             : 
    5409         146 :             CPLDebug(
    5410             :                 "GDAL",
    5411             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    5412             :                 "too high block, "
    5413             :                 "use partial width at one time");
    5414             :         }
    5415         273 :         else if ((nSwathLines % nBlockYSize) != 0)
    5416             :         {
    5417             :             // Round on a multiple of nBlockYSize.
    5418           9 :             nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
    5419           9 :             CPLDebug(
    5420             :                 "GDAL",
    5421             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    5422             :                 "round nSwathLines to block height : %d",
    5423             :                 nSwathLines);
    5424             :         }
    5425             :     }
    5426             : 
    5427        3375 :     *pnSwathCols = nSwathCols;
    5428        3375 :     *pnSwathLines = nSwathLines;
    5429        3375 : }
    5430             : 
    5431             : /************************************************************************/
    5432             : /*                     GDALDatasetCopyWholeRaster()                     */
    5433             : /************************************************************************/
    5434             : 
    5435             : /**
    5436             :  * \brief Copy all dataset raster data.
    5437             :  *
    5438             :  * This function copies the complete raster contents of one dataset to
    5439             :  * another similarly configured dataset.  The source and destination
    5440             :  * dataset must have the same number of bands, and the same width
    5441             :  * and height.  The bands do not have to have the same data type.
    5442             :  *
    5443             :  * This function is primarily intended to support implementation of
    5444             :  * driver specific CreateCopy() functions.  It implements efficient copying,
    5445             :  * in particular "chunking" the copy in substantial blocks and, if appropriate,
    5446             :  * performing the transfer in a pixel interleaved fashion.
    5447             :  *
    5448             :  * Currently the only papszOptions values supported are:
    5449             :  * <ul>
    5450             :  * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
    5451             :  * write access pattern (this does not modify the layout of the destination
    5452             :  * data)</li>
    5453             :  * <li>"COMPRESSED=YES" to force alignment on target dataset block
    5454             :  * sizes to achieve best compression.</li>
    5455             :  * <li>"SKIP_HOLES=YES" to skip chunks
    5456             :  * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
    5457             :  * (GDAL &gt;= 2.2)</li>
    5458             :  * </ul>
    5459             :  * More options may be supported in the future.
    5460             :  *
    5461             :  * @param hSrcDS the source dataset
    5462             :  * @param hDstDS the destination dataset
    5463             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    5464             :  * @param pfnProgress progress reporting function.
    5465             :  * @param pProgressData callback data for progress function.
    5466             :  *
    5467             :  * @return CE_None on success, or CE_Failure on failure.
    5468             :  */
    5469             : 
    5470        3347 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
    5471             :                                               GDALDatasetH hDstDS,
    5472             :                                               CSLConstList papszOptions,
    5473             :                                               GDALProgressFunc pfnProgress,
    5474             :                                               void *pProgressData)
    5475             : // Copies all bands of hSrcDS into hDstDS, one swath at a time, through a single scratch buffer.
    5476             : {
    5477        3347 :     VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    5478        3347 :     VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    5479             : 
    5480        3347 :     GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
    5481        3347 :     GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
    5482             :     // A null callback is swapped for GDALDummyProgress so it can be called unconditionally.
    5483        3347 :     if (pfnProgress == nullptr)
    5484           0 :         pfnProgress = GDALDummyProgress;
    5485             : 
    5486             :     /* -------------------------------------------------------------------- */
    5487             :     /*      Confirm the datasets match in size and band counts.             */
    5488             :     /* -------------------------------------------------------------------- */
    5489        3347 :     const int nXSize = poDstDS->GetRasterXSize();
    5490        3347 :     const int nYSize = poDstDS->GetRasterYSize();
    5491        3347 :     const int nBandCount = poDstDS->GetRasterCount();
    5492             : 
    5493        3347 :     if (poSrcDS->GetRasterXSize() != nXSize ||
    5494        6694 :         poSrcDS->GetRasterYSize() != nYSize ||
    5495        3347 :         poSrcDS->GetRasterCount() != nBandCount)
    5496             :     {
    5497           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    5498             :                  "Input and output dataset sizes or band counts do not\n"
    5499             :                  "match in GDALDatasetCopyWholeRaster()");
    5500           0 :         return CE_Failure;
    5501             :     }
    5502             : 
    5503             :     /* -------------------------------------------------------------------- */
    5504             :     /*      Report preliminary (0) progress.                                */
    5505             :     /* -------------------------------------------------------------------- */
    5506        3347 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    5507             :     {
    5508           1 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    5509             :                  "User terminated CreateCopy()");
    5510           1 :         return CE_Failure;
    5511             :     }
    5512             : 
    5513             :     /* -------------------------------------------------------------------- */
    5514             :     /*      Get our prototype band, and assume the others are similarly     */
    5515             :     /*      configured.                                                     */
    5516             :     /* -------------------------------------------------------------------- */
    5517        3346 :     if (nBandCount == 0)
    5518           0 :         return CE_None;  // no bands: nothing to copy
    5519             : 
    5520        3346 :     GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
    5521        3346 :     GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
    5522        3346 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    5523             :     // eDT, the type of destination band 1, is the buffer type for every read and write below.
    5524             :     /* -------------------------------------------------------------------- */
    5525             :     /*      Do we want to try and do the operation in a pixel               */
    5526             :     /*      interleaved fashion?                                            */
    5527             :     /* -------------------------------------------------------------------- */
    5528        3346 :     bool bInterleave = false;
    5529             :     const char *pszInterleave =
    5530        3346 :         poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    5531        3346 :     if (pszInterleave != nullptr &&
    5532        2942 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    5533         209 :         bInterleave = true;
    5534             :     // Same test on the destination: PIXEL or LINE on either side enables interleaving.
    5535        3346 :     pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    5536        3346 :     if (pszInterleave != nullptr &&
    5537        2881 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    5538         528 :         bInterleave = true;
    5539             :     // An explicit INTERLEAVE option then overrides the metadata-derived choice.
    5540        3346 :     pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
    5541        3346 :     if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
    5542           5 :         bInterleave = true;
    5543        3341 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
    5544          13 :         bInterleave = false;
    5545             :     // attributes is specific to the TileDB driver
    5546        3328 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
    5547           4 :         bInterleave = true;
    5548        3324 :     else if (pszInterleave != nullptr)
    5549             :     {
    5550           0 :         CPLError(CE_Warning, CPLE_NotSupported,
    5551             :                  "Unsupported value for option INTERLEAVE");
    5552             :     }
    5553             : 
    5554             :     // If the destination is compressed, we must try to write blocks just once,
    5555             :     // to save disk space (GTiff case for example), and to avoid data loss
    5556             :     // (JPEG compression for example).
    5557        3346 :     bool bDstIsCompressed = false;
    5558             :     const char *pszDstCompressed =
    5559        3346 :         CSLFetchNameValue(papszOptions, "COMPRESSED");
    5560        3346 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5561         393 :         bDstIsCompressed = true;
    5562             : 
    5563             :     /* -------------------------------------------------------------------- */
    5564             :     /*      What will our swath size be?                                    */
    5565             :     /* -------------------------------------------------------------------- */
    5566             : 
    5567        3346 :     int nSwathCols = 0;
    5568        3346 :     int nSwathLines = 0;
    5569        3346 :     GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
    5570             :                                     nBandCount, bDstIsCompressed, bInterleave,
    5571             :                                     &nSwathCols, &nSwathLines);
    5572             :     // The swath buffer is sized for one band, or for all bands when interleaving.
    5573        3346 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5574        3346 :     if (bInterleave)
    5575         583 :         nPixelSize *= nBandCount;
    5576             : 
    5577        3346 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    5578        3346 :     if (pSwathBuf == nullptr)
    5579             :     {
    5580           0 :         return CE_Failure;
    5581             :     }
    5582             : 
    5583        3346 :     CPLDebug("GDAL",
    5584             :              "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
    5585             :              nSwathCols, nSwathLines, static_cast<int>(bInterleave));
    5586             : 
    5587             :     // Advise the source raster that we are going to read it completely
    5588             :     // Note: this might already have been done by GDALCreateCopy() in the
    5589             :     // likely case this function is indirectly called by it
    5590        3346 :     poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
    5591        3346 :                         nullptr, nullptr);
    5592             : 
    5593             :     /* ==================================================================== */
    5594             :     /*      Band oriented (uninterleaved) case.                             */
    5595             :     /* ==================================================================== */
    5596        3346 :     CPLErr eErr = CE_None;
    5597             :     const bool bCheckHoles =
    5598        3346 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    5599             : 
    5600        3346 :     if (!bInterleave)
    5601             :     {
    5602             :         GDALRasterIOExtraArg sExtraArg;
    5603        2763 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5604        2763 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    5605             :         // One progress unit per swath and per band.
    5606        8289 :         const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
    5607        2763 :                                      DIV_ROUND_UP(nYSize, nSwathLines) *
    5608        2763 :                                      DIV_ROUND_UP(nXSize, nSwathCols);
    5609        2763 :         GIntBig nBlocksDone = 0;
    5610             : 
    5611        7969 :         for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
    5612             :         {
    5613        5206 :             int nBand = iBand + 1;
    5614             : 
    5615       10675 :             for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5616             :             {
    5617        5469 :                 int nThisLines = nSwathLines;
    5618             :                 // Shorten the last swath so it does not run past the raster height.
    5619        5469 :                 if (iY + nThisLines > nYSize)
    5620         368 :                     nThisLines = nYSize - iY;
    5621             : 
    5622       10938 :                 for (int iX = 0; iX < nXSize && eErr == CE_None;
    5623        5469 :                      iX += nSwathCols)
    5624             :                 {
    5625        5469 :                     int nThisCols = nSwathCols;
    5626             :                     // Likewise for the raster width.
    5627        5469 :                     if (iX + nThisCols > nXSize)
    5628           0 :                         nThisCols = nXSize - iX;
    5629             :                     // Unless SKIP_HOLES is set, every chunk is assumed to hold data.
    5630        5469 :                     int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5631        5469 :                     if (bCheckHoles)
    5632             :                     {
    5633             :                         nStatus = poSrcDS->GetRasterBand(nBand)
    5634        3757 :                                       ->GetDataCoverageStatus(
    5635             :                                           iX, iY, nThisCols, nThisLines,
    5636             :                                           GDAL_DATA_COVERAGE_STATUS_DATA);
    5637             :                     }
    5638        5469 :                     if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5639             :                     {
    5640        5465 :                         sExtraArg.pfnProgress = GDALScaledProgress;
    5641       10930 :                         sExtraArg.pProgressData = GDALCreateScaledProgress(
    5642        5465 :                             nBlocksDone / static_cast<double>(nTotalBlocks),
    5643        5465 :                             (nBlocksDone + 0.5) /
    5644        5465 :                                 static_cast<double>(nTotalBlocks),
    5645             :                             pfnProgress, pProgressData);
    5646        5465 :                         if (sExtraArg.pProgressData == nullptr)
    5647        1682 :                             sExtraArg.pfnProgress = nullptr;
    5648             :                         // The read reports into the first half of this chunk's progress span.
    5649        5465 :                         eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    5650             :                                                  nThisLines, pSwathBuf,
    5651             :                                                  nThisCols, nThisLines, eDT, 1,
    5652             :                                                  &nBand, 0, 0, 0, &sExtraArg);
    5653             : 
    5654        5465 :                         GDALDestroyScaledProgress(sExtraArg.pProgressData);
    5655             :                         // The destination is only written when the read succeeded.
    5656        5465 :                         if (eErr == CE_None)
    5657        5458 :                             eErr = poDstDS->RasterIO(
    5658             :                                 GF_Write, iX, iY, nThisCols, nThisLines,
    5659             :                                 pSwathBuf, nThisCols, nThisLines, eDT, 1,
    5660             :                                 &nBand, 0, 0, 0, nullptr);
    5661             :                     }
    5662             :                     // A chunk skipped as a hole still counts as done for progress.
    5663        5469 :                     nBlocksDone++;
    5664       10896 :                     if (eErr == CE_None &&
    5665        5427 :                         !pfnProgress(nBlocksDone /
    5666        5427 :                                          static_cast<double>(nTotalBlocks),
    5667             :                                      nullptr, pProgressData))
    5668             :                     {
    5669           2 :                         eErr = CE_Failure;
    5670           2 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    5671             :                                  "User terminated CreateCopy()");
    5672             :                     }
    5673             :                 }
    5674             :             }
    5675             :         }
    5676             :     }
    5677             : 
    5678             :     /* ==================================================================== */
    5679             :     /*      Pixel interleaved case.                                         */
    5680             :     /* ==================================================================== */
    5681             :     else /* if( bInterleave ) */
    5682             :     {
    5683             :         GDALRasterIOExtraArg sExtraArg;
    5684         583 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5685         583 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    5686             :         // One progress unit per swath; all bands are moved together.
    5687         583 :         const GIntBig nTotalBlocks =
    5688         583 :             static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
    5689         583 :             DIV_ROUND_UP(nXSize, nSwathCols);
    5690         583 :         GIntBig nBlocksDone = 0;
    5691             : 
    5692        1388 :         for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5693             :         {
    5694         805 :             int nThisLines = nSwathLines;
    5695             :             // Shorten the last swath so it does not run past the raster height.
    5696         805 :             if (iY + nThisLines > nYSize)
    5697         198 :                 nThisLines = nYSize - iY;
    5698             : 
    5699        1615 :             for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    5700             :             {
    5701         810 :                 int nThisCols = nSwathCols;
    5702             :                 // Likewise for the raster width.
    5703         810 :                 if (iX + nThisCols > nXSize)
    5704           3 :                     nThisCols = nXSize - iX;
    5705             :                 // With SKIP_HOLES, the chunk is copied as soon as one band reports data.
    5706         810 :                 int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5707         810 :                 if (bCheckHoles)
    5708             :                 {
    5709         551 :                     nStatus = 0;
    5710         604 :                     for (int iBand = 0; iBand < nBandCount; iBand++)
    5711             :                     {
    5712         585 :                         nStatus |= poSrcDS->GetRasterBand(iBand + 1)
    5713         585 :                                        ->GetDataCoverageStatus(
    5714             :                                            iX, iY, nThisCols, nThisLines,
    5715             :                                            GDAL_DATA_COVERAGE_STATUS_DATA);
    5716         585 :                         if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5717         532 :                             break;
    5718             :                     }
    5719             :                 }
    5720         810 :                 if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5721             :                 {
    5722         791 :                     sExtraArg.pfnProgress = GDALScaledProgress;
    5723        1582 :                     sExtraArg.pProgressData = GDALCreateScaledProgress(
    5724         791 :                         nBlocksDone / static_cast<double>(nTotalBlocks),
    5725         791 :                         (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
    5726             :                         pfnProgress, pProgressData);
    5727         791 :                     if (sExtraArg.pProgressData == nullptr)
    5728         375 :                         sExtraArg.pfnProgress = nullptr;
    5729             :                     // The read reports into the first half of this chunk's progress span.
    5730         791 :                     eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    5731             :                                              nThisLines, pSwathBuf, nThisCols,
    5732             :                                              nThisLines, eDT, nBandCount,
    5733             :                                              nullptr, 0, 0, 0, &sExtraArg);
    5734             : 
    5735         791 :                     GDALDestroyScaledProgress(sExtraArg.pProgressData);
    5736             :                     // The destination is only written when the read succeeded.
    5737         791 :                     if (eErr == CE_None)
    5738         790 :                         eErr = poDstDS->RasterIO(
    5739             :                             GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
    5740             :                             nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
    5741             :                             0, 0, nullptr);
    5742             :                 }
    5743             :                 // A chunk skipped as a hole still counts as done for progress.
    5744         810 :                 nBlocksDone++;
    5745        1615 :                 if (eErr == CE_None &&
    5746         805 :                     !pfnProgress(nBlocksDone /
    5747         805 :                                      static_cast<double>(nTotalBlocks),
    5748             :                                  nullptr, pProgressData))
    5749             :                 {
    5750           1 :                     eErr = CE_Failure;
    5751           1 :                     CPLError(CE_Failure, CPLE_UserInterrupt,
    5752             :                              "User terminated CreateCopy()");
    5753             :                 }
    5754             :             }
    5755             :         }
    5756             :     }
    5757             : 
    5758             :     /* -------------------------------------------------------------------- */
    5759             :     /*      Cleanup                                                         */
    5760             :     /* -------------------------------------------------------------------- */
    5761        3346 :     CPLFree(pSwathBuf);
    5762             : 
    5763        3346 :     return eErr;
    5764             : }
    5765             : 
    5766             : /************************************************************************/
    5767             : /*                   GDALRasterBandCopyWholeRaster()                    */
    5768             : /************************************************************************/
    5769             : 
    5770             : /**
    5771             :  * \brief Copy a whole raster band
    5772             :  *
    5773             :  * This function copies the complete raster contents of one band to
    5774             :  * another similarly configured band.  The source and destination
    5775             :  * bands must have the same width and height.  The bands do not have
    5776             :  * to have the same data type: pixels are read in the destination band's type.
    5777             :  *
    5778             :  * It implements efficient copying, in particular "chunking" the copy in
    5779             :  * substantial blocks.
    5780             :  *
    5781             :  * Currently the only papszOptions values supported are:
    5782             :  * <ul>
    5783             :  * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
    5784             :  * achieve best compression.</li>
    5785             :  * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
    5786             :  * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL &gt;= 2.2)</li>
    5787             :  * </ul>
    5788             :  *
    5789             :  * @param hSrcBand the source band
    5790             :  * @param hDstBand the destination band
    5791             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    5792             :  * @param pfnProgress progress reporting function.
    5793             :  * @param pProgressData callback data for progress function.
    5794             :  *
    5795             :  * @return CE_None on success, or CE_Failure on failure.
    5796             :  */
    5797             : 
    5798          29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
    5799             :     GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
    5800             :     const char *const *const papszOptions, GDALProgressFunc pfnProgress,
    5801             :     void *pProgressData)
    5802             : 
    5803             : {
    5804          29 :     VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    5805          29 :     VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    5806             : 
    5807          29 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    5808          29 :     GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
    5809          29 :     CPLErr eErr = CE_None;
    5810             :     // A null callback is swapped for GDALDummyProgress so it can be called unconditionally.
    5811          29 :     if (pfnProgress == nullptr)
    5812           2 :         pfnProgress = GDALDummyProgress;
    5813             : 
    5814             :     /* -------------------------------------------------------------------- */
    5815             :     /*      Confirm the two bands have the same dimensions.                 */
    5816             :     /* -------------------------------------------------------------------- */
    5817          29 :     int nXSize = poSrcBand->GetXSize();
    5818          29 :     int nYSize = poSrcBand->GetYSize();
    5819             : 
    5820          29 :     if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
    5821             :     {
    5822           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    5823             :                  "Input and output band sizes do not\n"
    5824             :                  "match in GDALRasterBandCopyWholeRaster()");
    5825           0 :         return CE_Failure;
    5826             :     }
    5827             : 
    5828             :     /* -------------------------------------------------------------------- */
    5829             :     /*      Report preliminary (0) progress.                                */
    5830             :     /* -------------------------------------------------------------------- */
    5831          29 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    5832             :     {
    5833           0 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    5834             :                  "User terminated CreateCopy()");
    5835           0 :         return CE_Failure;
    5836             :     }
    5837             :     // The destination type is the buffer type: the source is read directly as eDT.
    5838          29 :     GDALDataType eDT = poDstBand->GetRasterDataType();
    5839             : 
    5840             :     // If the destination is compressed, we must try to write blocks just once,
    5841             :     // to save disk space (GTiff case for example), and to avoid data loss
    5842             :     // (JPEG compression for example).
    5843          29 :     bool bDstIsCompressed = false;
    5844             :     const char *pszDstCompressed =
    5845          29 :         CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
    5846          29 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5847          26 :         bDstIsCompressed = true;
    5848             : 
    5849             :     /* -------------------------------------------------------------------- */
    5850             :     /*      What will our swath size be?                                    */
    5851             :     /* -------------------------------------------------------------------- */
    5852             : 
    5853          29 :     int nSwathCols = 0;
    5854          29 :     int nSwathLines = 0;
    5855          29 :     GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
    5856             :                                     FALSE, &nSwathCols, &nSwathLines);  // single band, not interleaved
    5857             : 
    5858          29 :     const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5859             : 
    5860          29 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    5861          29 :     if (pSwathBuf == nullptr)
    5862             :     {
    5863           0 :         return CE_Failure;
    5864             :     }
    5865             : 
    5866          29 :     CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
    5867             :              nSwathCols, nSwathLines);
    5868             : 
    5869             :     const bool bCheckHoles =
    5870          29 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    5871             : 
    5872             :     // Advise the source raster that we are going to read it completely
    5873          29 :     poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
    5874             : 
    5875             :     /* ==================================================================== */
    5876             :     /*      Band oriented (uninterleaved) case.                             */
    5877             :     /* ==================================================================== */
    5878             : 
    5879          72 :     for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5880             :     {
    5881          43 :         int nThisLines = nSwathLines;
    5882             :         // Shorten the last swath so it does not run past the raster height.
    5883          43 :         if (iY + nThisLines > nYSize)
    5884           8 :             nThisLines = nYSize - iY;
    5885             : 
    5886          86 :         for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    5887             :         {
    5888          43 :             int nThisCols = nSwathCols;
    5889             :             // Likewise for the raster width.
    5890          43 :             if (iX + nThisCols > nXSize)
    5891           0 :                 nThisCols = nXSize - iX;
    5892             :             // Unless SKIP_HOLES is set, every chunk is assumed to hold data.
    5893          43 :             int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5894          43 :             if (bCheckHoles)
    5895             :             {
    5896           0 :                 nStatus = poSrcBand->GetDataCoverageStatus(
    5897             :                     iX, iY, nThisCols, nThisLines,
    5898             :                     GDAL_DATA_COVERAGE_STATUS_DATA);
    5899             :             }
    5900          43 :             if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5901             :             {
    5902          43 :                 eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
    5903             :                                            nThisLines, pSwathBuf, nThisCols,
    5904             :                                            nThisLines, eDT, 0, 0, nullptr);
    5905             :                 // The destination is only written when the read succeeded.
    5906          43 :                 if (eErr == CE_None)
    5907          43 :                     eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
    5908             :                                                nThisLines, pSwathBuf, nThisCols,
    5909             :                                                nThisLines, eDT, 0, 0, nullptr);
    5910             :             }
    5911             :             // Progress depends on rows only: (iY + nThisLines) / nYSize, reported after each chunk.
    5912          86 :             if (eErr == CE_None && !pfnProgress(double(iY + nThisLines) /
    5913          43 :                                                     static_cast<double>(nYSize),
    5914             :                                                 nullptr, pProgressData))
    5915             :             {
    5916           0 :                 eErr = CE_Failure;
    5917           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt,
    5918             :                          "User terminated CreateCopy()");
    5919             :             }
    5920             :         }
    5921             :     }
    5922             : 
    5923             :     /* -------------------------------------------------------------------- */
    5924             :     /*      Cleanup                                                         */
    5925             :     /* -------------------------------------------------------------------- */
    5926          29 :     CPLFree(pSwathBuf);
    5927             : 
    5928          29 :     return eErr;
    5929             : }
    5930             : 
    5931             : /************************************************************************/
    5932             : /*                     GDALCopyRasterIOExtraArg ()                      */
    5933             : /************************************************************************/
    5934             : // Runs INIT_RASTERIO_EXTRA_ARG on *psDestArg, then copies the fields of *psSrcArg into it when psSrcArg is non-null.
    5935      533484 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
    5936             :                               const GDALRasterIOExtraArg *psSrcArg)
    5937             : {
    5938      533484 :     INIT_RASTERIO_EXTRA_ARG(*psDestArg);  // presumably resets every field to its default - confirm against the macro
    5939      533484 :     if (psSrcArg)
    5940             :     {
    5941      533484 :         psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
    5942      533484 :         psDestArg->pfnProgress = psSrcArg->pfnProgress;
    5943      533484 :         psDestArg->pProgressData = psSrcArg->pProgressData;
    5944      533484 :         psDestArg->bFloatingPointWindowValidity =
    5945      533484 :             psSrcArg->bFloatingPointWindowValidity;
    5946      533484 :         if (psSrcArg->bFloatingPointWindowValidity)  // the df* window is copied only when flagged valid
    5947             :         {
    5948      210512 :             psDestArg->dfXOff = psSrcArg->dfXOff;
    5949      210512 :             psDestArg->dfYOff = psSrcArg->dfYOff;
    5950      210512 :             psDestArg->dfXSize = psSrcArg->dfXSize;
    5951      210512 :             psDestArg->dfYSize = psSrcArg->dfYSize;
    5952             :         }
    5953      533484 :         if (psSrcArg->nVersion >= 2)  // field only read from sources declaring version 2 or later
    5954             :         {
    5955      533484 :             psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
    5956             :         }
    5957      533484 :         if (psSrcArg->nVersion >= 3)  // field only read from sources declaring version 3 or later
    5958             :         {
    5959      533484 :             psDestArg->bOperateInBufType = psSrcArg->bOperateInBufType;
    5960             :         }
    5961             :     }
    5962      533484 : }
    5963             : 
    5964             : /************************************************************************/
    5965             : /*                           HasOnlyNoData()                            */
    5966             : /************************************************************************/
    5967             : 
    // Generic nodata test: a sample matches when it compares equal to the
    // nodata value with operator==.
    template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
    {
        const bool bMatches = (value == noDataValue);
        return bMatches;
    }
    5972             : 
    5973        5509 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
    5974             : {
    5975             :     using std::isnan;
    5976        5509 :     return isnan(noDataValue) ? isnan(value) : value == noDataValue;
    5977             : }
    5978             : 
    5979      251221 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
    5980             : {
    5981      251221 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5982             : }
    5983             : 
    5984      264257 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
    5985             : {
    5986      264257 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5987             : }
    5988             : 
    /**
     * Tell whether every sample of a 2D, pixel-interleaved buffer matches the
     * nodata value, as decided by IsEqualToNoData().
     *
     * @param pBuffer      First sample of the top-left pixel.
     * @param noDataValue  Value each sample is compared against.
     * @param nWidth       Number of pixels per line.
     * @param nHeight      Number of lines.
     * @param nLineStride  Distance between the starts of two consecutive lines,
     *                     in pixels (it is multiplied by nComponents to get a
     *                     sample offset).
     * @param nComponents  Number of interleaved samples per pixel.
     * @return true if all nWidth * nHeight * nComponents samples match (which
     *         is trivially the case for an empty window), false otherwise.
     */
    template <class T>
    static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
                               size_t nHeight, size_t nLineStride,
                               size_t nComponents)
    {
        // An empty window has no sample that could differ from nodata. Return
        // before the probe test: nWidth - 1 / nHeight - 1 would wrap around for
        // a zero size_t and index far outside the buffer.
        if (nWidth == 0 || nHeight == 0)
            return true;

        const size_t nLastCol = nWidth - 1;
        const size_t nLastRow = nHeight - 1;

        // Fast test: check the 4 corners and the middle pixel. Offsets are those
        // of the first sample of each probed pixel, in the order top-left,
        // top-right, middle, bottom-left, bottom-right.
        const size_t anProbeOffsets[] = {
            0,
            nLastCol * nComponents,
            (nLastRow / 2 * nLineStride + nLastCol / 2) * nComponents,
            nLastRow * nLineStride * nComponents,
            (nLastRow * nLineStride + nLastCol) * nComponents};
        for (size_t iBand = 0; iBand < nComponents; iBand++)
        {
            for (const size_t nProbeOffset : anProbeOffsets)
            {
                if (!IsEqualToNoData(pBuffer[nProbeOffset + iBand], noDataValue))
                    return false;
            }
        }

        // Test all pixels, line by line (lines may be padded up to nLineStride).
        const size_t nSamplesPerLine = nWidth * nComponents;
        for (size_t iY = 0; iY < nHeight; iY++)
        {
            const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
            for (size_t iX = 0; iX < nSamplesPerLine; iX++)
            {
                if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
                    return false;
            }
        }
        return true;
    }
    6032             : 
    6033             : /************************************************************************/
    6034             : /*                      GDALBufferHasOnlyNoData()                       */
    6035             : /************************************************************************/
    6036             : 
    6037       43909 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
    6038             :                              size_t nWidth, size_t nHeight, size_t nLineStride,
    6039             :                              size_t nComponents, int nBitsPerSample,
    6040             :                              GDALBufferSampleFormat nSampleFormat)
    6041             : {
                           // Tells whether every sample of pBuffer equals dfNoDataValue.
                           //
                           // pBuffer holds nHeight lines of nWidth pixels, each pixel made of
                           // nComponents samples of nBitsPerSample bits interpreted according to
                           // nSampleFormat. Consecutive lines start nLineStride pixels apart.
                           //
                           // A zero nodata value on a buffer without line padding
                           // (nWidth == nLineStride) is handled by scanning raw bytes / bit
                           // patterns. Everything else is dispatched to the HasOnlyNoDataT<>
                           // instantiation matching the sample type. Combinations handled by
                           // neither path yield false.
                           //
    6042             :     // In the case where the nodata is 0, we can compare several bytes at
    6043             :     // once. Select the largest natural integer type for the architecture.
    6044       43909 :     if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    6045             :         // Do not use this optimized code path for floating point numbers,
    6046             :         // as it can't detect negative zero.
    6047             :         nSampleFormat != GSF_FLOATING_POINT)
    6048             :     {
    6049       27265 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
                               // Buffer size in bytes, rounded up so that a trailing partial byte
                               // (sample widths that are not a multiple of 8 bits) is included.
    6050       27265 :         const size_t nSize =
    6051       27265 :             static_cast<size_t>((static_cast<uint64_t>(nWidth) * nHeight *
    6052       27265 :                                      nComponents * nBitsPerSample +
    6053             :                                  7) /
    6054             :                                 8);
    6055             : #ifdef HAVE_SSE2
    6056       27265 :         size_t n = nSize;
    6057             :         // Align to 16 bytes
    6058       27328 :         while ((reinterpret_cast<uintptr_t>(pabyBuffer) & 15) != 0 && n > 0)
    6059             :         {
    6060          73 :             --n;
    6061          73 :             if (*pabyBuffer)
    6062          10 :                 return false;
    6063          63 :             pabyBuffer++;
    6064             :         }
    6065             : 
                               // Main loop: OR together four aligned 16-byte loads and test the
                               // result against zero, i.e. 64 bytes per iteration.
    6066       27255 :         const auto zero = _mm_setzero_si128();
    6067       27255 :         constexpr int UNROLLING = 4;
    6068     2223230 :         while (n >= UNROLLING * sizeof(zero))
    6069             :         {
    6070     2207980 :             const auto v0 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6071             :                 pabyBuffer + 0 * sizeof(zero)));
    6072     2207980 :             const auto v1 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6073     2207980 :                 pabyBuffer + 1 * sizeof(zero)));
    6074     2207980 :             const auto v2 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6075     2207980 :                 pabyBuffer + 2 * sizeof(zero)));
    6076     2207980 :             const auto v3 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6077     2207980 :                 pabyBuffer + 3 * sizeof(zero)));
    6078             :             const auto v =
    6079     6623930 :                 _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    6080             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    6081             :             if (!_mm_test_all_zeros(v, v))
    6082             : #else
    6083     4415960 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    6084             : #endif
    6085             :             {
    6086       12001 :                 return false;
    6087             :             }
    6088     2195980 :             pabyBuffer += UNROLLING * sizeof(zero);
    6089     2195980 :             n -= UNROLLING * sizeof(zero);
    6090             :         }
    6091             : 
                               // Remaining bytes (fewer than 64), one at a time.
    6092      233639 :         while (n > 0)
    6093             :         {
    6094      218489 :             --n;
    6095      218489 :             if (*pabyBuffer)
    6096         104 :                 return false;
    6097      218385 :             pabyBuffer++;
    6098             :         }
    6099             : #else
    6100             : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
    6101             :         // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
    6102             :         typedef std::uint64_t WordType;
    6103             : #else
    6104             :         typedef std::uint32_t WordType;
    6105             : #endif
    6106             : 
                               // Byte-wise scan up to the next WordType boundary (a full word's
                               // worth of bytes when pBuffer is already aligned), capped at nSize.
    6107             :         const size_t nInitialIters =
    6108             :             std::min(sizeof(WordType) -
    6109             :                          static_cast<size_t>(
    6110             :                              reinterpret_cast<std::uintptr_t>(pabyBuffer) %
    6111             :                              sizeof(WordType)),
    6112             :                      nSize);
    6113             :         size_t i = 0;
    6114             :         for (; i < nInitialIters; i++)
    6115             :         {
    6116             :             if (pabyBuffer[i])
    6117             :                 return false;
    6118             :         }
                               // Whole aligned words.
    6119             :         for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
    6120             :         {
    6121             :             if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
    6122             :                 return false;
    6123             :         }
                               // Trailing bytes that do not fill a word.
    6124             :         for (; i < nSize; i++)
    6125             :         {
    6126             :             if (pabyBuffer[i])
    6127             :                 return false;
    6128             :         }
    6129             : #endif
    6130       15150 :         return true;
    6131             :     }
    6132             : 
    6133             : #ifdef HAVE_SSE2
                           // Zero nodata with 32-bit floats: the bit patterns are tested with the
                           // sign bit masked out, so both +0.0 and -0.0 count as nodata. Loads are
                           // unaligned here (no alignment prologue, unlike the integer path).
    6134       16644 :     else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    6135         708 :              nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    6136             :     {
    6137         708 :         const auto signMask = _mm_set1_epi32(0x7FFFFFFF);
    6138         708 :         const auto zero = _mm_setzero_si128();
    6139         708 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    6140         708 :         const size_t n = nWidth * nHeight * nComponents;
    6141             : 
    6142         708 :         size_t i = 0;
    6143         708 :         constexpr int UNROLLING = 4;
    6144         708 :         constexpr size_t VALUES_PER_ITER =
    6145             :             UNROLLING * sizeof(zero) / sizeof(float);
    6146       24983 :         for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
    6147             :         {
    6148       24934 :             const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6149             :                 pabyBuffer + 0 * sizeof(zero)));
    6150       24934 :             const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6151       24934 :                 pabyBuffer + 1 * sizeof(zero)));
    6152       24934 :             const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6153       24934 :                 pabyBuffer + 2 * sizeof(zero)));
    6154       24934 :             const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6155       24934 :                 pabyBuffer + 3 * sizeof(zero)));
    6156       74802 :             auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    6157             :             // Clear the sign bit (makes -0.0 become +0.0)
    6158       24934 :             v = _mm_and_si128(v, signMask);
    6159             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    6160             :             if (!_mm_test_all_zeros(v, v))
    6161             : #else
    6162       49868 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    6163             : #endif
    6164             :             {
    6165         659 :                 return false;
    6166             :             }
    6167       24275 :             pabyBuffer += UNROLLING * sizeof(zero);
    6168             :         }
    6169             : 
                               // Leftover values, tested one by one on their bit pattern.
    6170         304 :         for (; i < n; i++)
    6171             :         {
    6172             :             uint32_t bits;
    6173         272 :             memcpy(&bits, pabyBuffer, sizeof(bits));
    6174         272 :             pabyBuffer += sizeof(bits);
    6175         272 :             if ((bits & 0x7FFFFFFF) != 0)
    6176          17 :                 return false;
    6177             :         }
    6178             : 
    6179          32 :         return true;
    6180             :     }
    6181             : 
                           // Same approach for 64-bit floats.
    6182       15936 :     else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    6183        3905 :              nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    6184             :     {
    6185        3905 :         const auto signMask = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
    6186        3905 :         const auto zero = _mm_setzero_si128();
    6187        3905 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    6188        3905 :         const size_t n = nWidth * nHeight * nComponents;
    6189             : 
    6190        3905 :         size_t i = 0;
    6191        3905 :         constexpr int UNROLLING = 4;
    6192        3905 :         constexpr size_t VALUES_PER_ITER =
    6193             :             UNROLLING * sizeof(zero) / sizeof(double);
    6194     1664570 :         for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
    6195             :         {
    6196     1660950 :             const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6197             :                 pabyBuffer + 0 * sizeof(zero)));
    6198     1660950 :             const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6199     1660950 :                 pabyBuffer + 1 * sizeof(zero)));
    6200     1660950 :             const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6201     1660950 :                 pabyBuffer + 2 * sizeof(zero)));
    6202     1660950 :             const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6203     1660950 :                 pabyBuffer + 3 * sizeof(zero)));
    6204     4982850 :             auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    6205             :             // Clear the sign bit (makes -0.0 become +0.0)
    6206     1660950 :             v = _mm_and_si128(v, signMask);
    6207             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    6208             :             if (!_mm_test_all_zeros(v, v))
    6209             : #else
    6210     3321900 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    6211             : #endif
    6212             :             {
    6213         289 :                 return false;
    6214             :             }
    6215     1660660 :             pabyBuffer += UNROLLING * sizeof(zero);
    6216             :         }
    6217             : 
                               // Leftover values, tested one by one on their bit pattern.
    6218        3643 :         for (; i < n; i++)
    6219             :         {
    6220             :             uint64_t bits;
    6221          34 :             memcpy(&bits, pabyBuffer, sizeof(bits));
    6222          34 :             pabyBuffer += sizeof(bits);
    6223          34 :             if ((bits & 0x7FFFFFFFFFFFFFFFULL) != 0)
    6224           7 :                 return false;
    6225             :         }
    6226             : 
    6227        3609 :         return true;
    6228             :     }
    6229             : #endif
    6230             : 
                           // Generic path: pick the HasOnlyNoDataT<> instantiation matching the
                           // sample width and format. For integer types the scan only runs when
                           // GDALIsValueInRange<> accepts dfNoDataValue (presumably: the value is
                           // representable in that type -- confirm against its definition);
                           // otherwise the result is false.
    6231       12031 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
    6232             :     {
    6233       22424 :         return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
    6234       11212 :                HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
    6235       11212 :                               static_cast<uint8_t>(dfNoDataValue), nWidth,
    6236       11212 :                               nHeight, nLineStride, nComponents);
    6237             :     }
    6238         819 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
    6239             :     {
    6240             :         // Use unsigned implementation by converting the nodatavalue to
    6241             :         // unsigned
    6242         119 :         return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
    6243          59 :                HasOnlyNoDataT(
    6244             :                    static_cast<const uint8_t *>(pBuffer),
    6245          59 :                    static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
    6246          60 :                    nWidth, nHeight, nLineStride, nComponents);
    6247             :     }
    6248         759 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
    6249             :     {
    6250          23 :         return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
    6251          11 :                HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
    6252          11 :                               static_cast<uint16_t>(dfNoDataValue), nWidth,
    6253          12 :                               nHeight, nLineStride, nComponents);
    6254             :     }
    6255         747 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
    6256             :     {
    6257             :         // Use unsigned implementation by converting the nodatavalue to
    6258             :         // unsigned
    6259         111 :         return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
    6260          55 :                HasOnlyNoDataT(
    6261             :                    static_cast<const uint16_t *>(pBuffer),
    6262          55 :                    static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
    6263          56 :                    nWidth, nHeight, nLineStride, nComponents);
    6264             :     }
    6265         691 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
    6266             :     {
    6267         129 :         return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
    6268          64 :                HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
    6269             :                               static_cast<uint32_t>(dfNoDataValue), nWidth,
    6270          65 :                               nHeight, nLineStride, nComponents);
    6271             :     }
    6272         626 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
    6273             :     {
    6274             :         // Use unsigned implementation by converting the nodatavalue to
    6275             :         // unsigned
    6276          23 :         return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
    6277          11 :                HasOnlyNoDataT(
    6278             :                    static_cast<const uint32_t *>(pBuffer),
    6279          11 :                    static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
    6280          12 :                    nWidth, nHeight, nLineStride, nComponents);
    6281             :     }
    6282         614 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
    6283             :     {
    6284         112 :         return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
    6285          56 :                HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
    6286             :                               static_cast<uint64_t>(dfNoDataValue), nWidth,
    6287          56 :                               nHeight, nLineStride, nComponents);
    6288             :     }
    6289         558 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
    6290             :     {
    6291             :         // Use unsigned implementation by converting the nodatavalue to
    6292             :         // unsigned
    6293           0 :         return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
    6294           0 :                HasOnlyNoDataT(
    6295             :                    static_cast<const uint64_t *>(pBuffer),
    6296           0 :                    static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
    6297           0 :                    nWidth, nHeight, nLineStride, nComponents);
    6298             :     }
                           // Floating point types: a NaN nodata value bypasses the range check
                           // and is matched with isnan() by the IsEqualToNoData<> specializations.
    6299         558 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
    6300             :     {
    6301         106 :         return (std::isnan(dfNoDataValue) ||
    6302         211 :                 GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
    6303         105 :                HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
    6304             :                               static_cast<GFloat16>(dfNoDataValue), nWidth,
    6305         106 :                               nHeight, nLineStride, nComponents);
    6306             :     }
    6307         452 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    6308             :     {
    6309         268 :         return (std::isnan(dfNoDataValue) ||
    6310         535 :                 GDALIsValueInRange<float>(dfNoDataValue)) &&
    6311         267 :                HasOnlyNoDataT(static_cast<const float *>(pBuffer),
    6312             :                               static_cast<float>(dfNoDataValue), nWidth,
    6313         268 :                               nHeight, nLineStride, nComponents);
    6314             :     }
    6315         184 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    6316             :     {
    6317         184 :         return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
    6318             :                               dfNoDataValue, nWidth, nHeight, nLineStride,
    6319         184 :                               nComponents);
    6320             :     }
                           // No branch above matched: the width/format combination is not
                           // supported with this nodata value (e.g. a non-zero nodata with a
                           // sample width other than 8, 16, 32 or 64 bits).
    6321           0 :     return false;
    6322             : }
    6323             : 
    6324             : #ifdef HAVE_SSE2
    6325             : 
    6326             : /************************************************************************/
    6327             : /*                       GDALDeinterleave3Byte()                        */
    6328             : /************************************************************************/
    6329             : 
    6330             : #if defined(__GNUC__) && !defined(__clang__)
    6331             : __attribute__((optimize("no-tree-vectorize")))
    6332             : #endif
                       // With GCC (not clang) the attribute above is attached to the function
                       // defined below and asks for it to be built with "no-tree-vectorize".
    6333      380714 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    6334             :                                   GByte *CPL_RESTRICT pabyDest0,
    6335             :                                   GByte *CPL_RESTRICT pabyDest1,
    6336             :                                   GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    6337             : #ifdef USE_NEON_OPTIMIZATIONS
    6338             : {
                           // NEON build: always forward to GDALDeinterleave3Byte_SSSE3().
    6339             :     return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6340             :                                        nIters);
    6341             : }
    6342             : #else
    6343             : {
                           // Splits nIters pixels of 3 interleaved bytes read from pabySrc into
                           // three planar buffers: byte k of pixel i is written to pabyDestk[i].
                           // The SSSE3 implementation is used when it was compiled in and the CPU
                           // reports support at runtime; otherwise the code below runs.
    6344             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    6345      380714 :     if (CPLHaveRuntimeSSSE3())
    6346             :     {
    6347      380712 :         return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    6348      380712 :                                            pabyDest2, nIters);
    6349             :     }
    6350             : #endif
    6351             : 
    6352           2 :     size_t i = 0;
                           // Word-at-a-time path, taken only when the source and the three
                           // destination pointers are all aligned on sizeof(unsigned int).
    6353           2 :     if (((reinterpret_cast<uintptr_t>(pabySrc) |
    6354           2 :           reinterpret_cast<uintptr_t>(pabyDest0) |
    6355           2 :           reinterpret_cast<uintptr_t>(pabyDest1) |
    6356           2 :           reinterpret_cast<uintptr_t>(pabyDest2)) %
    6357             :          sizeof(unsigned int)) == 0)
    6358             :     {
    6359             :         // Slightly better than GCC autovectorizer
    6360          17 :         for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
    6361             :         {
                                   // 4 pixels = 12 source bytes, read as three 32-bit words. With
                                   // components named A, B, C and the lowest-addressed byte first:
                                   //   word0 = A0 B0 C0 A1, word1 = B1 C1 A2 B2, word2 = C2 A3 B3 C3
                                   // The shifts and masks below match the byte-wise loop at the end
                                   // of the function only for a little-endian word layout.
    6362          15 :             unsigned int word0 =
    6363          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
    6364          15 :             unsigned int word1 =
    6365          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
    6366          15 :             unsigned int word2 =
    6367          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
                                   // A0 A1 A2 A3 written as one word.
    6368          15 :             reinterpret_cast<unsigned int *>(pabyDest0)[j] =
    6369          15 :                 (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
    6370          15 :                 ((word2 >> 8) << 24);
                                   // B0 B1 B2 B3 written as one word.
    6371          15 :             reinterpret_cast<unsigned int *>(pabyDest1)[j] =
    6372          15 :                 ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
    6373          15 :                 (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
                                   // C0 C1 C2 C3 written byte by byte.
    6374          15 :             pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
    6375          15 :             pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
    6376          15 :             pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
    6377          15 :             pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
    6378             :         }
    6379             :     }
                           // Remaining pixels (all of them when the pointers were not aligned),
                           // one pixel per iteration.
    6380             : #if defined(__clang__)
    6381             : #pragma clang loop vectorize(disable)
    6382             : #endif
    6383           3 :     for (; i < nIters; ++i)
    6384             :     {
    6385           1 :         pabyDest0[i] = pabySrc[3 * i + 0];
    6386           1 :         pabyDest1[i] = pabySrc[3 * i + 1];
    6387           1 :         pabyDest2[i] = pabySrc[3 * i + 2];
    6388             :     }
    6389             : }
    6390             : #endif
    6391             : 
    6392             : /************************************************************************/
    6393             : /*                       GDALDeinterleave4Byte()                        */
    6394             : /************************************************************************/
    6395             : 
    6396             : #if !defined(__GNUC__) || defined(__clang__)
    6397             : 
    6398             : /************************************************************************/
    6399             : /*                            deinterleave()                            */
    6400             : /************************************************************************/
    6401             : 
    // Extracts one byte lane out of sixteen 32-bit words held in four SSE
    // registers and returns the 16 extracted bytes packed in one register
    // (words of xmm0_ori first, then xmm1_ori, xmm2_ori, xmm3_ori).
    //
    // SHIFT: first shift every 32-bit word right by 8 bits, in place. The
    //        registers are passed by reference, so successive calls walk
    //        through the byte lanes.
    // MASK:  keep only the low 8 bits of every word before packing. It may be
    //        skipped when the upper 24 bits are already zero.
    template <bool SHIFT, bool MASK>
    inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
                                __m128i &xmm2_ori, __m128i &xmm3_ori)
    {
        if (SHIFT)
        {
            // Bring the next byte of each word down to the low 8 bits. The
            // caller's registers are updated.
            xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
            xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
            xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
            xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
        }

        // Work on copies from here on: masking must not alter the inputs.
        __m128i words0 = xmm0_ori;
        __m128i words1 = xmm1_ori;
        __m128i words2 = xmm2_ori;
        __m128i words3 = xmm3_ori;
        if (MASK)
        {
            // Set the higher 24 bits of each 32-bit word to 0.
            const __m128i lowByteMask = _mm_set1_epi32(0xff);
            words0 = _mm_and_si128(words0, lowByteMask);
            words1 = _mm_and_si128(words1, lowByteMask);
            words2 = _mm_and_si128(words2, lowByteMask);
            words3 = _mm_and_si128(words3, lowByteMask);
        }

        // Narrow int32 -> int16, then int16 -> uint8.
        const __m128i halves01 = _mm_packs_epi32(words0, words1);
        const __m128i halves23 = _mm_packs_epi32(words2, words3);
        return _mm_packus_epi16(halves01, halves23);
    }
    6440             : 
    6441             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    6442             :                                   GByte *CPL_RESTRICT pabyDest0,
    6443             :                                   GByte *CPL_RESTRICT pabyDest1,
    6444             :                                   GByte *CPL_RESTRICT pabyDest2,
    6445             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6446             : #ifdef USE_NEON_OPTIMIZATIONS
    6447             : {
    6448             :     return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6449             :                                        pabyDest3, nIters);
    6450             : }
    6451             : #else
    6452             : {
    6453             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    6454             :     if (CPLHaveRuntimeSSSE3())
    6455             :     {
    6456             :         return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    6457             :                                            pabyDest2, pabyDest3, nIters);
    6458             :     }
    6459             : #endif
    6460             : 
    6461             :     // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
    6462             :     // do something slightly better.
    6463             :     size_t i = 0;
    6464             :     for (; i + 15 < nIters; i += 16)
    6465             :     {
    6466             :         __m128i xmm0_ori = _mm_loadu_si128(
    6467             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
    6468             :         __m128i xmm1_ori = _mm_loadu_si128(
    6469             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
    6470             :         __m128i xmm2_ori = _mm_loadu_si128(
    6471             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
    6472             :         __m128i xmm3_ori = _mm_loadu_si128(
    6473             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
    6474             : 
    6475             :         _mm_storeu_si128(
    6476             :             reinterpret_cast<__m128i *>(pabyDest0 + i),
    6477             :             deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6478             :         _mm_storeu_si128(
    6479             :             reinterpret_cast<__m128i *>(pabyDest1 + i),
    6480             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6481             :         _mm_storeu_si128(
    6482             :             reinterpret_cast<__m128i *>(pabyDest2 + i),
    6483             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6484             :         _mm_storeu_si128(
    6485             :             reinterpret_cast<__m128i *>(pabyDest3 + i),
    6486             :             deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6487             :     }
    6488             : 
    6489             : #if defined(__clang__)
    6490             : #pragma clang loop vectorize(disable)
    6491             : #endif
    6492             :     for (; i < nIters; ++i)
    6493             :     {
    6494             :         pabyDest0[i] = pabySrc[4 * i + 0];
    6495             :         pabyDest1[i] = pabySrc[4 * i + 1];
    6496             :         pabyDest2[i] = pabySrc[4 * i + 2];
    6497             :         pabyDest3[i] = pabySrc[4 * i + 3];
    6498             :     }
    6499             : }
    6500             : #endif
    6501             : #else
    6502             : // GCC autovectorizer does an excellent job
                       //
                       // De-interleaves 4-byte pixels into four separate byte planes:
                       //   pabyDestK[i] = pabySrc[4 * i + K]   for K = 0..3 and i in [0, nIters)
                       // Reads 4 * nIters bytes from pabySrc and writes nIters bytes to each
                       // destination buffer; nIters == 0 does nothing.
                       // The optimize("tree-vectorize") attribute asks GCC to run its loop
                       // vectorizer on this function.
                       // NOTE(review): CPL_RESTRICT presumably expands to a restrict qualifier
                       // (i.e. the five buffers must not overlap) - confirm in cpl_port.h.
    6503       73219 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
    6504             :     const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    6505             :     GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
    6506             :     GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6507             : {
    6508   539713000 :     for (size_t i = 0; i < nIters; ++i)
    6509             :     {
    6510   539640000 :         pabyDest0[i] = pabySrc[4 * i + 0];
    6511   539640000 :         pabyDest1[i] = pabySrc[4 * i + 1];
    6512   539640000 :         pabyDest2[i] = pabySrc[4 * i + 2];
    6513   539640000 :         pabyDest3[i] = pabySrc[4 * i + 3];
    6514             :     }
    6515       73219 : }
    6516             : #endif
    6517             : 
    6518             : #else
    6519             : 
    6520             : /************************************************************************/
    6521             : /*                       GDALDeinterleave3Byte()                        */
    6522             : /************************************************************************/
    6523             : 
                       // Plain scalar de-interleave of 3-byte pixels into three byte planes:
                       //   pabyDestK[i] = pabySrc[3 * i + K]   for K = 0..2 and i in [0, nIters)
                       // Reads 3 * nIters bytes from pabySrc and writes nIters bytes to each
                       // destination buffer; nIters == 0 does nothing.
                       // NOTE(review): CPL_RESTRICT presumably expands to a restrict qualifier
                       // (i.e. the four buffers must not overlap) - confirm in cpl_port.h.
                       //
    6524             : // TODO: Enabling below could help on non-Intel architectures where GCC knows
    6525             : // how to auto-vectorize
    6526             : // #if defined(__GNUC__)
    6527             : //__attribute__((optimize("tree-vectorize")))
    6528             : // #endif
    6529             : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    6530             :                                   GByte *CPL_RESTRICT pabyDest0,
    6531             :                                   GByte *CPL_RESTRICT pabyDest1,
    6532             :                                   GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    6533             : {
    6534             :     for (size_t i = 0; i < nIters; ++i)
    6535             :     {
    6536             :         pabyDest0[i] = pabySrc[3 * i + 0];
    6537             :         pabyDest1[i] = pabySrc[3 * i + 1];
    6538             :         pabyDest2[i] = pabySrc[3 * i + 2];
    6539             :     }
    6540             : }
    6541             : 
    6542             : /************************************************************************/
    6543             : /*                       GDALDeinterleave4Byte()                        */
    6544             : /************************************************************************/
    6545             : 
                       // Plain scalar de-interleave of 4-byte pixels into four byte planes:
                       //   pabyDestK[i] = pabySrc[4 * i + K]   for K = 0..3 and i in [0, nIters)
                       // Reads 4 * nIters bytes from pabySrc and writes nIters bytes to each
                       // destination buffer; nIters == 0 does nothing.
                       // NOTE(review): CPL_RESTRICT presumably expands to a restrict qualifier
                       // (i.e. the five buffers must not overlap) - confirm in cpl_port.h.
                       //
    6546             : // TODO: Enabling below could help on non-Intel architectures where gcc knows
    6547             : // how to auto-vectorize
    6548             : // #if defined(__GNUC__)
    6549             : //__attribute__((optimize("tree-vectorize")))
    6550             : // #endif
    6551             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    6552             :                                   GByte *CPL_RESTRICT pabyDest0,
    6553             :                                   GByte *CPL_RESTRICT pabyDest1,
    6554             :                                   GByte *CPL_RESTRICT pabyDest2,
    6555             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6556             : {
    6557             :     for (size_t i = 0; i < nIters; ++i)
    6558             :     {
    6559             :         pabyDest0[i] = pabySrc[4 * i + 0];
    6560             :         pabyDest1[i] = pabySrc[4 * i + 1];
    6561             :         pabyDest2[i] = pabySrc[4 * i + 2];
    6562             :         pabyDest3[i] = pabySrc[4 * i + 3];
    6563             :     }
    6564             : }
    6565             : 
    6566             : #endif
    6567             : 
    6568             : /************************************************************************/
    6569             : /*                          GDALDeinterleave()                          */
    6570             : /************************************************************************/
    6571             : 
    6572             : /*! Copy values from a pixel-interleaved buffer to multiple per-component
    6573             :     buffers.
    6574             : 
    6575             :     In pseudo-code
    6576             :     \verbatim
    6577             :     for(size_t i = 0; i < nIters; ++i)
    6578             :         for(int iComp = 0; iComp < nComponents; iComp++ )
    6579             :             ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
    6580             :     \endverbatim
    6581             : 
    6582             :     The implementation is optimized for a few cases, like de-interleaving
    6583             :     of 3- or 4-component Byte buffers.
                       
                           \param pSourceBuffer Interleaved input holding nIters pixels, each made
                                                of nComponents values of type eSourceDT.
                           \param eSourceDT     Data type of the values in pSourceBuffer.
                           \param nComponents   Number of interleaved components. This is also the
                                                number of entries read from ppDestBuffer.
                           \param ppDestBuffer  Array of nComponents output pointers; nIters values
                                                of type eDestDT are written to each of them.
                           \param eDestDT       Data type of the output values. When it differs
                                                from eSourceDT, the copy goes through
                                                GDALCopyWords64(), which is given both types.
                           \param nIters        Number of pixels to de-interleave.
    6584             : 
    6585             :     \since GDAL 3.6
    6586             :  */
    6587      454283 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
    6588             :                       int nComponents, void **ppDestBuffer,
    6589             :                       GDALDataType eDestDT, size_t nIters)
    6590             : {
                           // The specialized kernels below only move values around, so they are
                           // considered only when source and destination types are identical.
    6591      454283 :     if (eSourceDT == eDestDT)
    6592             :     {
                               // 8-bit case: signed and unsigned bytes share the GByte kernels.
    6593      454261 :         if (eSourceDT == GDT_UInt8 || eSourceDT == GDT_Int8)
    6594             :         {
    6595      453940 :             if (nComponents == 3)
    6596             :             {
    6597      380714 :                 const GByte *CPL_RESTRICT pabySrc =
    6598             :                     static_cast<const GByte *>(pSourceBuffer);
    6599      380714 :                 GByte *CPL_RESTRICT pabyDest0 =
    6600             :                     static_cast<GByte *>(ppDestBuffer[0]);
    6601      380714 :                 GByte *CPL_RESTRICT pabyDest1 =
    6602             :                     static_cast<GByte *>(ppDestBuffer[1]);
    6603      380714 :                 GByte *CPL_RESTRICT pabyDest2 =
    6604             :                     static_cast<GByte *>(ppDestBuffer[2]);
    6605      380714 :                 GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6606             :                                       nIters);
    6607      380714 :                 return;
    6608             :             }
    6609       73226 :             else if (nComponents == 4)
    6610             :             {
    6611       73219 :                 const GByte *CPL_RESTRICT pabySrc =
    6612             :                     static_cast<const GByte *>(pSourceBuffer);
    6613       73219 :                 GByte *CPL_RESTRICT pabyDest0 =
    6614             :                     static_cast<GByte *>(ppDestBuffer[0]);
    6615       73219 :                 GByte *CPL_RESTRICT pabyDest1 =
    6616             :                     static_cast<GByte *>(ppDestBuffer[1]);
    6617       73219 :                 GByte *CPL_RESTRICT pabyDest2 =
    6618             :                     static_cast<GByte *>(ppDestBuffer[2]);
    6619       73219 :                 GByte *CPL_RESTRICT pabyDest3 =
    6620             :                     static_cast<GByte *>(ppDestBuffer[3]);
    6621       73219 :                 GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6622             :                                       pabyDest3, nIters);
    6623       73219 :                 return;
    6624           7 :             }
                                   // Any other component count falls through to the generic path.
    6625             :         }
                               // 16-bit case: SSSE3 kernels, built only for the compilers selected
                               // by the condition below and used only when the CPU reports SSSE3
                               // support at run time.
    6626             : #if ((defined(__GNUC__) && !defined(__clang__)) ||                             \
    6627             :      defined(__INTEL_CLANG_COMPILER)) &&                                       \
    6628             :     defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
    6629         642 :         else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
    6630         321 :                  CPLHaveRuntimeSSSE3())
    6631             :         {
    6632         321 :             if (nComponents == 3)
    6633             :             {
    6634         126 :                 const GUInt16 *CPL_RESTRICT panSrc =
    6635             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    6636         126 :                 GUInt16 *CPL_RESTRICT panDest0 =
    6637             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    6638         126 :                 GUInt16 *CPL_RESTRICT panDest1 =
    6639             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    6640         126 :                 GUInt16 *CPL_RESTRICT panDest2 =
    6641             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    6642         126 :                 GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
    6643             :                                               panDest2, nIters);
    6644         126 :                 return;
    6645             :             }
    6646             : #if !defined(__INTEL_CLANG_COMPILER)
    6647             :             // ICC autovectorizer doesn't do a good job, at least with icx
    6648             :             // 2022.1.0.20220316
    6649         195 :             else if (nComponents == 4)
    6650             :             {
    6651         195 :                 const GUInt16 *CPL_RESTRICT panSrc =
    6652             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    6653         195 :                 GUInt16 *CPL_RESTRICT panDest0 =
    6654             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    6655         195 :                 GUInt16 *CPL_RESTRICT panDest1 =
    6656             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    6657         195 :                 GUInt16 *CPL_RESTRICT panDest2 =
    6658             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    6659         195 :                 GUInt16 *CPL_RESTRICT panDest3 =
    6660             :                     static_cast<GUInt16 *>(ppDestBuffer[3]);
    6661         195 :                 GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
    6662             :                                               panDest2, panDest3, nIters);
    6663         195 :                 return;
    6664             :             }
    6665             : #endif
    6666             :         }
    6667             : #endif
    6668             :     }
    6669             : 
                           // Generic path: one strided GDALCopyWords64() call per component. The
                           // source pointer starts at component iComp of the first pixel and
                           // advances by a whole pixel (nComponents * nSourceDTSize bytes); the
                           // destination advances by a single value (nDestDTSize bytes).
    6670          29 :     const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
    6671          29 :     const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
    6672         108 :     for (int iComp = 0; iComp < nComponents; iComp++)
    6673             :     {
    6674          79 :         GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
    6675          79 :                             iComp * nSourceDTSize,
    6676             :                         eSourceDT, nComponents * nSourceDTSize,
    6677          79 :                         ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
    6678             :     }
    6679             : }
    6680             : 
    6681             : /************************************************************************/
    6682             : /*                   GDALTranspose2DSingleToSingle()                    */
    6683             : /************************************************************************/
    6684             : /**
    6685             :  * Transpose a 2D array of non-complex values, in an efficient way (the array
                        * is processed in tiles of 32 x 32 elements).
                        *
                        * For each source row k and column l, pSrc[l + k * nSrcWidth] is passed to
                        * GDALCopyWord() with pDst[k + l * nSrcHeight] as destination.
                        *
                        * @tparam DST Element type of the destination array.
                        * @tparam SRC Element type of the source array.
    6686             :  *
    6687             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6688             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6689             :  * @param nSrcWidth Width of pSrc array.
    6690             :  * @param nSrcHeight Height of pSrc array.
    6691             :  */
    6692             : 
    6693             : template <class DST, class SRC>
    6694         160 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
    6695             :                                    DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6696             :                                    size_t nSrcHeight)
    6697             : {
                           // Edge length, in elements, of the square tiles handled at a time.
    6698         160 :     constexpr size_t blocksize = 32;
    6699         345 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6700             :     {
                               // Clamp the tile to the array height (the last tile may be partial).
    6701         185 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    6702        5016 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6703             :         {
    6704             :             // transpose the block beginning at [i,j]
    6705        4831 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    6706       26185 :             for (size_t k = i; k < max_k; ++k)
    6707             :             {
    6708      669282 :                 for (size_t l = j; l < max_l; ++l)
    6709             :                 {
    6710      647928 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],
    6711      647928 :                                  pDst[k + l * nSrcHeight]);
    6712             :                 }
    6713             :             }
    6714             :         }
    6715             :     }
    6716         160 : }
    6717             : 
    6718             : /************************************************************************/
    6719             : /*                  GDALTranspose2DComplexToComplex()                   */
    6720             : /************************************************************************/
    6721             : /**
    6722             :  * Transpose a 2D array of complex values into an array of complex values,
    6723             :  * in an efficient way (the array is processed in tiles of 32 x 32 elements).
                        *
                        * Each element occupies two consecutive SRC (resp. DST) values, presumably
                        * the real part followed by the imaginary part. For each source row k and
                        * column l, both values of source element (l + k * nSrcWidth) are passed to
                        * GDALCopyWord() with the matching values of destination element
                        * (k + l * nSrcHeight) as destination.
                        *
                        * @tparam DST Type of one component of a destination element.
                        * @tparam SRC Type of one component of a source element.
    6724             :  *
    6725             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6726             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6727             :  * @param nSrcWidth Width of pSrc array.
    6728             :  * @param nSrcHeight Height of pSrc array.
    6729             :  */
    6730             : template <class DST, class SRC>
    6731          25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
    6732             :                                      DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6733             :                                      size_t nSrcHeight)
    6734             : {
                           // Edge length, in elements, of the square tiles handled at a time.
    6735          25 :     constexpr size_t blocksize = 32;
    6736          50 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6737             :     {
                               // Clamp the tile to the array height (the last tile may be partial).
    6738          25 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    6739          50 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6740             :         {
    6741             :             // transpose the block beginning at [i,j]
    6742          25 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    6743          75 :             for (size_t k = i; k < max_k; ++k)
    6744             :             {
    6745         200 :                 for (size_t l = j; l < max_l; ++l)
    6746             :                 {
                                           // First, then second value of the element pair.
    6747         150 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
    6748         150 :                                  pDst[2 * (k + l * nSrcHeight) + 0]);
    6749         150 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
    6750         150 :                                  pDst[2 * (k + l * nSrcHeight) + 1]);
    6751             :                 }
    6752             :             }
    6753             :         }
    6754             :     }
    6755          25 : }
    6756             : 
    6757             : /************************************************************************/
    6758             : /*                   GDALTranspose2DComplexToSingle()                   */
    6759             : /************************************************************************/
    6760             : /**
    6761             :  * Transpose a 2D array of complex values into an array of non-complex values,
    6762             :  * in an efficient way (the array is processed in tiles of 32 x 32 elements).
                        *
                        * Each source element occupies two consecutive SRC values. Only the first
                        * one (presumably the real part) is passed to GDALCopyWord(); the second
                        * one is not read. For each source row k and column l, the destination is
                        * pDst[k + l * nSrcHeight].
                        *
                        * @tparam DST Element type of the destination array.
                        * @tparam SRC Type of one component of a source element.
    6763             :  *
    6764             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6765             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6766             :  * @param nSrcWidth Width of pSrc array.
    6767             :  * @param nSrcHeight Height of pSrc array.
    6768             :  */
    6769             : template <class DST, class SRC>
    6770          55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
    6771             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6772             :                                     size_t nSrcHeight)
    6773             : {
                           // Edge length, in elements, of the square tiles handled at a time.
    6774          55 :     constexpr size_t blocksize = 32;
    6775         110 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6776             :     {
                               // Clamp the tile to the array height (the last tile may be partial).
    6777          55 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    6778         110 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6779             :         {
    6780             :             // transpose the block beginning at [i,j]
    6781          55 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    6782         165 :             for (size_t k = i; k < max_k; ++k)
    6783             :             {
    6784         440 :                 for (size_t l = j; l < max_l; ++l)
    6785             :                 {
                                           // Only the first value of the source pair is used.
    6786         330 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
    6787         330 :                                  pDst[k + l * nSrcHeight]);
    6788             :                 }
    6789             :             }
    6790             :         }
    6791             :     }
    6792          55 : }
    6793             : 
    6794             : /************************************************************************/
    6795             : /*                   GDALTranspose2DSingleToComplex()                   */
    6796             : /************************************************************************/
    6797             : /**
    6798             :  * Transpose a 2D array of non-complex values into an array of complex values,
    6799             :  * in an efficient way (the array is processed in tiles of 32 x 32 elements).
                        *
                        * Each destination element occupies two consecutive DST values. For each
                        * source row k and column l, pSrc[l + k * nSrcWidth] is passed to
                        * GDALCopyWord() with the first value of destination element
                        * (k + l * nSrcHeight) as destination, and the second value of that
                        * element (presumably the imaginary part) is set to 0.
                        *
                        * @tparam DST Type of one component of a destination element.
                        * @tparam SRC Element type of the source array.
    6800             :  *
    6801             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6802             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6803             :  * @param nSrcWidth Width of pSrc array.
    6804             :  * @param nSrcHeight Height of pSrc array.
    6805             :  */
    6806             : template <class DST, class SRC>
    6807          55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
    6808             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6809             :                                     size_t nSrcHeight)
    6810             : {
                           // Edge length, in elements, of the square tiles handled at a time.
    6811          55 :     constexpr size_t blocksize = 32;
    6812         110 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6813             :     {
                               // Clamp the tile to the array height (the last tile may be partial).
    6814          55 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    6815         110 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6816             :         {
    6817             :             // transpose the block beginning at [i,j]
    6818          55 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    6819         165 :             for (size_t k = i; k < max_k; ++k)
    6820             :             {
    6821         440 :                 for (size_t l = j; l < max_l; ++l)
    6822             :                 {
    6823         330 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],
    6824         330 :                                  pDst[2 * (k + l * nSrcHeight) + 0]);
                                           // Second value of the destination pair is zeroed.
    6825         330 :                     pDst[2 * (k + l * nSrcHeight) + 1] = 0;
    6826             :                 }
    6827             :             }
    6828             :         }
    6829             :     }
    6830          55 : }
    6831             : 
    6832             : /************************************************************************/
    6833             : /*                          GDALTranspose2D()                           */
    6834             : /************************************************************************/
    6835             : 
                       // Typed front-end of the transpose helpers: casts the untyped source
                       // pointer to the C++ type matching eSrcType and forwards to the helper
                       // selected by (source is complex?, DST_IS_COMPLEX).
                       //
                       // DST            : type of pDst (one component of an element when
                       //                  DST_IS_COMPLEX is true).
                       // DST_IS_COMPLEX : compile-time flag choosing the *ToComplex helpers
                       //                  instead of the *ToSingle ones.
                       // pSrc           : source array of nSrcHeight rows of nSrcWidth elements.
                       // eSrcType       : data type of the source elements.
                       // pDst           : destination (transposed) array.
                       // For GDT_Unknown and GDT_TypeCount no helper is called.
    6836             : template <class DST, bool DST_IS_COMPLEX>
    6837         295 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
    6838             :                             size_t nSrcWidth, size_t nSrcHeight)
    6839             : {
                       // Non-complex source of element type SRC_TYPE.
    6840             : #define CALL_GDALTranspose2D_internal(SRC_TYPE)                                \
    6841             :     do                                                                         \
    6842             :     {                                                                          \
    6843             :         if constexpr (DST_IS_COMPLEX)                                          \
    6844             :         {                                                                      \
    6845             :             GDALTranspose2DSingleToComplex(                                    \
    6846             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6847             :                 nSrcHeight);                                                   \
    6848             :         }                                                                      \
    6849             :         else                                                                   \
    6850             :         {                                                                      \
    6851             :             GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
    6852             :                                           pDst, nSrcWidth, nSrcHeight);        \
    6853             :         }                                                                      \
    6854             :     } while (0)
    6855             : 
                       // Complex source whose components are of type SRC_TYPE.
    6856             : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE)                         \
    6857             :     do                                                                         \
    6858             :     {                                                                          \
    6859             :         if constexpr (DST_IS_COMPLEX)                                          \
    6860             :         {                                                                      \
    6861             :             GDALTranspose2DComplexToComplex(                                   \
    6862             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6863             :                 nSrcHeight);                                                   \
    6864             :         }                                                                      \
    6865             :         else                                                                   \
    6866             :         {                                                                      \
    6867             :             GDALTranspose2DComplexToSingle(                                    \
    6868             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6869             :                 nSrcHeight);                                                   \
    6870             :         }                                                                      \
    6871             :     } while (0)
    6872             : 
    6873             :     // clang-format off
    6874         295 :     switch (eSrcType)
    6875             :     {
    6876          16 :         case GDT_UInt8:     CALL_GDALTranspose2D_internal(uint8_t); break;
    6877          15 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t); break;
    6878          33 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t); break;
    6879          20 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t); break;
    6880          24 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t); break;
    6881          16 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t); break;
    6882          16 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t); break;
    6883          16 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t); break;
    6884          16 :         case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16); break;
    6885          19 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float); break;
    6886          24 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double); break;
    6887          16 :         case GDT_CInt16:   CALL_GDALTranspose2DComplex_internal(int16_t); break;
    6888          16 :         case GDT_CInt32:   CALL_GDALTranspose2DComplex_internal(int32_t); break;
    6889          16 :         case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
    6890          16 :         case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
    6891          16 :         case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
                               // No element type to dispatch on: nothing is written to pDst.
    6892           0 :         case GDT_Unknown:
    6893             :         case GDT_TypeCount:
    6894           0 :             break;
    6895             :     }
    6896             :         // clang-format on
    6897             : 
    6898             : #undef CALL_GDALTranspose2D_internal
    6899             : #undef CALL_GDALTranspose2DComplex_internal
    6900         295 : }
    6901             : 
    6902             : /************************************************************************/
    6903             : /*                        GDALInterleave2Byte()                         */
    6904             : /************************************************************************/
    6905             : 
    6906             : #if defined(HAVE_SSE2) &&                                                      \
    6907             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    6908             : 
    6909             : // ICC autovectorizer doesn't do a good job at generating good SSE code,
    6910             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
                       //
                       // Interleaves two byte planes stored back to back in pSrc:
                       //   pDst[2 * i + c] = pSrc[i + c * nIters]   for c = 0..1, i in [0, nIters)
                       // Reads 2 * nIters bytes from pSrc and writes 2 * nIters bytes to pDst;
                       // nIters == 0 does nothing.
                       // NOTE(review): CPL_RESTRICT presumably expands to a restrict qualifier
                       // (i.e. pSrc and pDst must not overlap) - confirm in cpl_port.h.
    6911             : #if defined(__GNUC__)
    6912             : __attribute__((noinline))
    6913             : #endif
    6914             : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
    6915             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)
    6916             : {
    6917             :     size_t i = 0;
                           // Number of bytes taken from each plane per SSE2 iteration.
    6918             :     constexpr size_t VALS_PER_ITER = 16;
    6919             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
    6920             :     {
                               // 16 bytes of plane 0 and the matching 16 bytes of plane 1.
    6921             :         __m128i xmm0 =
    6922             :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
    6923             :         __m128i xmm1 = _mm_loadu_si128(
    6924             :             reinterpret_cast<__m128i const *>(pSrc + i + nIters));
                               // unpacklo/unpackhi interleave the low/high 8 bytes of both
                               // registers, giving the 32 output bytes of these 16 pixels.
    6925             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
    6926             :                          _mm_unpacklo_epi8(xmm0, xmm1));
    6927             :         _mm_storeu_si128(
    6928             :             reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
    6929             :             _mm_unpackhi_epi8(xmm0, xmm1));
    6930             :     }
                           // Scalar loop for the remaining values (fewer than VALS_PER_ITER);
                           // clang is told not to vectorize it.
    6931             : #if defined(__clang__)
    6932             : #pragma clang loop vectorize(disable)
    6933             : #endif
    6934             :     for (; i < nIters; ++i)
    6935             :     {
    6936             :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];
    6937             :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];
    6938             :     }
    6939             : }
    6940             : 
    6941             : #else
    6942             : 
    6943             : #if defined(__GNUC__) && !defined(__clang__)
    6944             : __attribute__((optimize("tree-vectorize")))
    6945             : #endif
    6946             : #if defined(__GNUC__)
    6947             : __attribute__((noinline))
    6948             : #endif
    6949             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6950             : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
    6951             : #pragma clang diagnostic push
    6952             : #pragma clang diagnostic ignored "-Wpass-failed"
    6953             : #endif
                       // Interleaves two byte planes stored back to back in pSrc:
                       //   pDst[2 * i + c] = pSrc[i + c * nIters]   for c = 0..1, i in [0, nIters)
                       // Reads 2 * nIters bytes from pSrc and writes 2 * nIters bytes to pDst;
                       // nIters == 0 does nothing.
                       // This variant is a plain loop: GCC is asked to vectorize the whole
                       // function (attribute above), clang to vectorize the loop (pragma below).
                       // NOTE(review): CPL_RESTRICT presumably expands to a restrict qualifier
                       // (i.e. pSrc and pDst must not overlap) - confirm in cpl_port.h.
    6954           9 : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
    6955             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)
    6956             : {
    6957             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6958             : #pragma clang loop vectorize(enable)
    6959             : #endif
    6960      355429 :     for (size_t i = 0; i < nIters; ++i)
    6961             :     {
    6962      355420 :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];
    6963      355420 :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];
    6964             :     }
    6965           9 : }
                       // Restore the diagnostic state saved before the function.
    6966             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6967             : #pragma clang diagnostic pop
    6968             : #endif
    6969             : 
    6970             : #endif
    6971             : 
    6972             : /************************************************************************/
    6973             : /*                        GDALInterleave4Byte()                         */
    6974             : /************************************************************************/
    6975             : 
    6976             : #if defined(HAVE_SSE2) &&                                                      \
    6977             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    6978             : 
    6979             : // ICC autovectorizer doesn't do a good job at generating good SSE code,
    6980             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
    6981             : #if defined(__GNUC__)
    6982             : __attribute__((noinline))
    6983             : #endif
    6984             : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
    6985             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)
    6986             : {
    6987             :     size_t i = 0;
    6988             :     constexpr size_t VALS_PER_ITER = 16;
    6989             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
    6990             :     {
    6991             :         __m128i xmm0 = _mm_loadu_si128(
    6992             :             reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
    6993             :         __m128i xmm1 = _mm_loadu_si128(
    6994             :             reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
    6995             :         __m128i xmm2 = _mm_loadu_si128(
    6996             :             reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
    6997             :         __m128i xmm3 = _mm_loadu_si128(
    6998             :             reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
    6999             :         auto tmp0 = _mm_unpacklo_epi8(
    7000             :             xmm0,
    7001             :             xmm1);  // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
    7002             :         auto tmp1 = _mm_unpackhi_epi8(
    7003             :             xmm0,
    7004             :             xmm1);  // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
    7005             :         auto tmp2 = _mm_unpacklo_epi8(
    7006             :             xmm2,
    7007             :             xmm3);  // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
    7008             :         auto tmp3 = _mm_unpackhi_epi8(
    7009             :             xmm2,
    7010             :             xmm3);  // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
    7011             :         auto tmp2_0 = _mm_unpacklo_epi16(
    7012             :             tmp0,
    7013             :             tmp2);  // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
    7014             :         auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
    7015             :         auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
    7016             :         auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
    7017             :         _mm_storeu_si128(
    7018             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
    7019             :             tmp2_0);
    7020             :         _mm_storeu_si128(
    7021             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
    7022             :             tmp2_1);
    7023             :         _mm_storeu_si128(
    7024             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
    7025             :             tmp2_2);
    7026             :         _mm_storeu_si128(
    7027             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
    7028             :             tmp2_3);
    7029             :     }
    7030             : #if defined(__clang__)
    7031             : #pragma clang loop vectorize(disable)
    7032             : #endif
    7033             :     for (; i < nIters; ++i)
    7034             :     {
    7035             :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];
    7036             :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];
    7037             :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];
    7038             :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];
    7039             :     }
    7040             : }
    7041             : 
    7042             : #else
    7043             : 
    7044             : #if defined(__GNUC__) && !defined(__clang__)
    7045             : __attribute__((optimize("tree-vectorize")))
    7046             : #endif
    7047             : #if defined(__GNUC__)
    7048             : __attribute__((noinline))
    7049             : #endif
    7050             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7051             : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
    7052             : #pragma clang diagnostic push
    7053             : #pragma clang diagnostic ignored "-Wpass-failed"
    7054             : #endif
    7055          30 : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
    7056             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)
    7057             : {
    7058             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7059             : #pragma clang loop vectorize(enable)
    7060             : #endif
    7061    49620700 :     for (size_t i = 0; i < nIters; ++i)
    7062             :     {
    7063    49620600 :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];
    7064    49620600 :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];
    7065    49620600 :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];
    7066    49620600 :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];
    7067             :     }
    7068          30 : }
    7069             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7070             : #pragma clang diagnostic pop
    7071             : #endif
    7072             : 
    7073             : #endif
    7074             : 
    7075             : /************************************************************************/
    7076             : /*                          GDALTranspose2D()                           */
    7077             : /************************************************************************/
    7078             : 
    7079             : /**
    7080             :  * Transpose a 2D array in a efficient (cache-oblivious) way.
    7081             :  *
    7082             :  * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
    7083             :  * @param eSrcType Data type of pSrc.
    7084             :  * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
    7085             :  * @param eDstType Data type of pDst.
    7086             :  * @param nSrcWidth Width of pSrc array.
    7087             :  * @param nSrcHeight Height of pSrc array.
    7088             :  * @since GDAL 3.11
    7089             :  */
    7090             : 
    7091         365 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
    7092             :                      GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
    7093             : {
    7094         365 :     if (eSrcType == eDstType && (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8))
    7095             :     {
    7096          70 :         if (nSrcHeight == 2)
    7097             :         {
    7098           9 :             GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
    7099             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    7100           9 :             return;
    7101             :         }
    7102          61 :         if (nSrcHeight == 4)
    7103             :         {
    7104          30 :             GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
    7105             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    7106          30 :             return;
    7107             :         }
    7108             : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                    \
    7109             :      (defined(__x86_64) || defined(_M_X64)))
    7110          31 :         if (CPLHaveRuntimeSSSE3())
    7111             :         {
    7112          31 :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    7113             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    7114             :                                        nSrcHeight);
    7115          31 :             return;
    7116             :         }
    7117             : #elif defined(USE_NEON_OPTIMIZATIONS)
    7118             :         {
    7119             :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    7120             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    7121             :                                        nSrcHeight);
    7122             :             return;
    7123             :         }
    7124             : #endif
    7125             :     }
    7126             : 
    7127             : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX)                \
    7128             :     GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>(                                 \
    7129             :         pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
    7130             : 
    7131             :     // clang-format off
    7132         295 :     switch (eDstType)
    7133             :     {
    7134          15 :         case GDT_UInt8:     CALL_GDALTranspose2D_internal(uint8_t, false); break;
    7135          15 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t, false); break;
    7136          33 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t, false); break;
    7137          20 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t, false); break;
    7138          24 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t, false); break;
    7139          16 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t, false); break;
    7140          16 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t, false); break;
    7141          16 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t, false); break;
    7142          16 :         case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16, false); break;
    7143          19 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float, false); break;
    7144          25 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double, false); break;
    7145          16 :         case GDT_CInt16:   CALL_GDALTranspose2D_internal(int16_t, true); break;
    7146          16 :         case GDT_CInt32:   CALL_GDALTranspose2D_internal(int32_t, true); break;
    7147          16 :         case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
    7148          16 :         case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
    7149          16 :         case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
    7150           0 :         case GDT_Unknown:
    7151             :         case GDT_TypeCount:
    7152           0 :             break;
    7153             :     }
    7154             :         // clang-format on
    7155             : 
    7156             : #undef CALL_GDALTranspose2D_internal
    7157             : }
    7158             : 
    7159             : /************************************************************************/
    7160             : /*                     ExtractBitAndConvertTo255()                      */
    7161             : /************************************************************************/
    7162             : 
    7163             : #if defined(__GNUC__) || defined(_MSC_VER)
    7164             : // Signedness of char implementation dependent, so be explicit.
    7165             : // Assumes 2-complement integer types and sign extension of right shifting
    7166             : // GCC guarantees such:
    7167             : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
    7168      143590 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
    7169             : {
    7170      143590 :     return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
    7171      143590 :                               7);
    7172             : }
    7173             : #else
    7174             : // Portable way
    7175             : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
    7176             : {
    7177             :     return (byVal & (1 << nBit)) ? 255 : 0;
    7178             : }
    7179             : #endif
    7180             : 
    7181             : /************************************************************************/
    7182             : /*                  ExpandEightPackedBitsToByteAt255()                  */
    7183             : /************************************************************************/
    7184             : 
    7185       17813 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
    7186             :                                                     GByte abyOutput[8])
    7187             : {
    7188       17813 :     abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);
    7189       17813 :     abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
    7190       17813 :     abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
    7191       17813 :     abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
    7192       17813 :     abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
    7193       17813 :     abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
    7194       17813 :     abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
    7195       17813 :     abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);
    7196       17813 : }
    7197             : 
    7198             : /************************************************************************/
    7199             : /*                 GDALExpandPackedBitsToByteAt0Or255()                 */
    7200             : /************************************************************************/
    7201             : 
    7202             : /** Expand packed-bits (ordered from most-significant bit to least one)
    7203             :   into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
    7204             :   at 1 to a byte at 255.
    7205             : 
    7206             :  The function does (in a possibly more optimized way) the following:
    7207             :  \code{.cpp}
    7208             :  for (size_t i = 0; i < nInputBits; ++i )
    7209             :  {
    7210             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
    7211             :  }
    7212             :  \endcode
    7213             : 
    7214             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    7215             :  @param pabyOutput Output array of nInputBits bytes.
    7216             :  @param nInputBits Number of valid bits in pabyInput.
    7217             : 
    7218             :  @since 3.11
    7219             : */
    7220             : 
    7221       45357 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
    7222             :                                         GByte *CPL_RESTRICT pabyOutput,
    7223             :                                         size_t nInputBits)
    7224             : {
    7225       45357 :     const size_t nInputWholeBytes = nInputBits / 8;
    7226       45357 :     size_t iByte = 0;
    7227             : 
    7228             : #ifdef HAVE_SSE2
    7229             :     // Mask to isolate each bit
    7230       45357 :     const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
    7231             :                                           8, 16, 32, 64, -128);
    7232       45357 :     const __m128i zero = _mm_setzero_si128();
    7233       45357 :     const __m128i all_ones = _mm_set1_epi8(-1);
    7234             : #ifdef __SSSE3__
    7235             :     const __m128i dispatch_two_bytes =
    7236             :         _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
    7237             : #endif
    7238       45357 :     constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
    7239      135866 :     for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
    7240             :     {
    7241       90509 :         __m128i reg_ori = _mm_loadu_si128(
    7242       90509 :             reinterpret_cast<const __m128i *>(pabyInput + iByte));
    7243             : 
    7244       90509 :         constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
    7245      814581 :         for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
    7246             :         {
    7247             :             // Given reg_ori = (A, B, ... 14 other bytes ...),
    7248             :             // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
    7249             : #ifdef __SSSE3__
    7250             :             __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
    7251             : #else
    7252      724072 :             __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
    7253      724072 :             reg = _mm_unpacklo_epi16(reg, reg);
    7254      724072 :             reg = _mm_unpacklo_epi32(reg, reg);
    7255             : #endif
    7256             : 
    7257             :             // Test if bits of interest are set
    7258      724072 :             reg = _mm_and_si128(reg, bit_mask);
    7259             : 
    7260             :             // Now test if those bits are set, by comparing to zero. So the
    7261             :             // result will be that bytes where bits are set will be at 0, and
    7262             :             // ones where they are cleared will be at 0xFF. So the inverse of
    7263             :             // the end result we want!
    7264      724072 :             reg = _mm_cmpeq_epi8(reg, zero);
    7265             : 
    7266             :             // Invert the result
    7267      724072 :             reg = _mm_andnot_si128(reg, all_ones);
    7268             : 
    7269             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
    7270             : 
    7271      724072 :             pabyOutput += SSE_REG_SIZE;
    7272             : 
    7273             :             // Right-shift of 2 bytes
    7274      724072 :             reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
    7275             :         }
    7276             :     }
    7277             : 
    7278             : #endif  // HAVE_SSE2
    7279             : 
    7280       63170 :     for (; iByte < nInputWholeBytes; ++iByte)
    7281             :     {
    7282       17813 :         ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
    7283       17813 :         pabyOutput += 8;
    7284             :     }
    7285       46443 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
    7286             :     {
    7287        1086 :         *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
    7288        1086 :         ++pabyOutput;
    7289             :     }
    7290       45357 : }
    7291             : 
    7292             : /************************************************************************/
    7293             : /*                   ExpandEightPackedBitsToByteAt1()                   */
    7294             : /************************************************************************/
    7295             : 
    7296      136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
    7297             :                                                   GByte abyOutput[8])
    7298             : {
    7299      136113 :     abyOutput[0] = (byVal >> 7) & 0x1;
    7300      136113 :     abyOutput[1] = (byVal >> 6) & 0x1;
    7301      136113 :     abyOutput[2] = (byVal >> 5) & 0x1;
    7302      136113 :     abyOutput[3] = (byVal >> 4) & 0x1;
    7303      136113 :     abyOutput[4] = (byVal >> 3) & 0x1;
    7304      136113 :     abyOutput[5] = (byVal >> 2) & 0x1;
    7305      136113 :     abyOutput[6] = (byVal >> 1) & 0x1;
    7306      136113 :     abyOutput[7] = (byVal >> 0) & 0x1;
    7307      136113 : }
    7308             : 
    7309             : /************************************************************************/
    7310             : /*                  GDALExpandPackedBitsToByteAt0Or1()                  */
    7311             : /************************************************************************/
    7312             : 
    7313             : /** Expand packed-bits (ordered from most-significant bit to least one)
    7314             :   into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
    7315             :   at 1 to a byte at 1.
    7316             : 
    7317             :  The function does (in a possibly more optimized way) the following:
    7318             :  \code{.cpp}
    7319             :  for (size_t i = 0; i < nInputBits; ++i )
    7320             :  {
    7321             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
    7322             :  }
    7323             :  \endcode
    7324             : 
    7325             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    7326             :  @param pabyOutput Output array of nInputBits bytes.
    7327             :  @param nInputBits Number of valid bits in pabyInput.
    7328             : 
    7329             :  @since 3.11
    7330             : */
    7331             : 
    7332        7033 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
    7333             :                                       GByte *CPL_RESTRICT pabyOutput,
    7334             :                                       size_t nInputBits)
    7335             : {
    7336        7033 :     const size_t nInputWholeBytes = nInputBits / 8;
    7337        7033 :     size_t iByte = 0;
    7338      143146 :     for (; iByte < nInputWholeBytes; ++iByte)
    7339             :     {
    7340      136113 :         ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
    7341      136113 :         pabyOutput += 8;
    7342             :     }
    7343       18886 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
    7344             :     {
    7345       11853 :         *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
    7346       11853 :         ++pabyOutput;
    7347             :     }
    7348        7033 : }

Generated by: LCOV version 1.14