LCOV - code coverage report
Current view: top level - gcore - rasterio.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2726 2983 91.4 %
Date: 2026-04-22 14:22:58 Functions: 690 730 94.5 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  Contains default implementation of GDALRasterBand::IRasterIO()
       5             :  *           and supporting functions of broader utility.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 1998, Frank Warmerdam
      10             :  * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal.h"
      17             : #include "gdal_priv.h"
      18             : 
      19             : #include <cassert>
      20             : #include <climits>
      21             : #include <cmath>
      22             : #include <cstddef>
      23             : #include <cstdio>
      24             : #include <cstdlib>
      25             : #include <cstring>
      26             : 
      27             : #include <algorithm>
      28             : #include <limits>
      29             : #include <stdexcept>
      30             : #include <type_traits>
      31             : 
      32             : #include "cpl_conv.h"
      33             : #include "cpl_cpu_features.h"
      34             : #include "cpl_error.h"
      35             : #include "cpl_float.h"
      36             : #include "cpl_progress.h"
      37             : #include "cpl_string.h"
      38             : #include "cpl_vsi.h"
      39             : #include "gdal_priv_templates.hpp"
      40             : #include "gdal_vrt.h"
      41             : #include "gdalwarper.h"
      42             : #include "memdataset.h"
      43             : #include "vrtdataset.h"
      44             : 
      45             : #if defined(__x86_64) || defined(_M_X64)
      46             : #include <emmintrin.h>
      47             : #include <immintrin.h>
      48             : #define HAVE_SSE2
      49             : // AVX2 dispatch: compile AVX2 code with target attribute, detect at runtime
      50             : #if (defined(__GNUC__) || defined(__clang__)) &&                               \
      51             :     defined(HAVE_AVX2_AT_COMPILE_TIME)
      52             : #define HAVE_AVX2_DISPATCH
      53             : #elif defined(_MSC_VER)
      54             : #include <intrin.h>
      55             : #define HAVE_AVX2_DISPATCH
      56             : #endif
      57             : #elif defined(USE_NEON_OPTIMIZATIONS)
      58             : #include "include_sse2neon.h"
      59             : #define HAVE_SSE2
      60             : #endif
      61             : 
      62             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
      63             : #include "rasterio_ssse3.h"
      64             : #ifdef __SSSE3__
      65             : #include <tmmintrin.h>
      66             : #endif
      67             : #endif
      68             : 
      69             : #ifdef __SSE4_1__
      70             : #include <smmintrin.h>
      71             : #endif
      72             : 
      73             : #ifdef __GNUC__
      74             : #define CPL_NOINLINE __attribute__((noinline))
      75             : #else
      76             : #define CPL_NOINLINE
      77             : #endif
      78             : 
      79             : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
      80             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
      81             :                              int nDstPixelStride, GPtrDiff_t nWordCount);
      82             : 
      83             : /************************************************************************/
      84             : /*                     DownsamplingIntegerXFactor()                     */
      85             : /************************************************************************/
      86             : 
      87             : template <bool bSameDataType, int DATA_TYPE_SIZE>
      88      695850 : static bool DownsamplingIntegerXFactor(
      89             :     GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
      90             :     GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
      91             :     GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
      92             :     int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
      93             : {
      94      695850 :     const int nBandDataSize =
      95             :         bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
      96      695850 :     int nOuterLoopIters = nBufXSize - 1;
      97      695850 :     const int nIncSrcOffset = nSrcXInc * nBandDataSize;
      98             :     const GByte *CPL_RESTRICT pabySrcData;
      99      695850 :     int nEndBlockX = nBlockXSize + nStartBlockX;
     100             : 
     101      695850 :     if (iSrcX < nEndBlockX)
     102             :     {
     103      295062 :         CPLAssert(poBlock);
     104      295062 :         goto no_reload_block;
     105             :     }
     106      400788 :     goto reload_block;
     107             : 
     108             :     // Don't do the last iteration in the loop, as iSrcX might go beyond
     109             :     // nRasterXSize - 1
     110     1265113 :     while (--nOuterLoopIters >= 1)
     111             :     {
     112      201834 :         iSrcX += nSrcXInc;
     113      201834 :         pabySrcData += nIncSrcOffset;
     114      201834 :         pabyDstData += nPixelSpace;
     115             : 
     116             :         /* --------------------------------------------------------------------
     117             :          */
     118             :         /*      Ensure we have the appropriate block loaded. */
     119             :         /* --------------------------------------------------------------------
     120             :          */
     121      201834 :         if (iSrcX >= nEndBlockX)
     122             :         {
     123      201834 :         reload_block:
     124             :         {
     125      615212 :             const int nLBlockX = iSrcX / nBlockXSize;
     126      615212 :             nStartBlockX = nLBlockX * nBlockXSize;
     127      615212 :             nEndBlockX = nStartBlockX + nBlockXSize;
     128             : 
     129      615212 :             if (poBlock != nullptr)
     130      341376 :                 poBlock->DropLock();
     131             : 
     132      615212 :             poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     133      615212 :             if (poBlock == nullptr)
     134             :             {
     135           1 :                 return false;
     136             :             }
     137             :         }
     138             : 
     139      615211 :         no_reload_block:
     140             :             const GByte *pabySrcBlock =
     141     1265113 :                 static_cast<const GByte *>(poBlock->GetDataRef());
     142     1265113 :             GPtrDiff_t iSrcOffset =
     143     1265113 :                 (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
     144     1265113 :             pabySrcData = pabySrcBlock + iSrcOffset;
     145             :         }
     146             : 
     147             :         /* --------------------------------------------------------------------
     148             :          */
     149             :         /*      Copy the maximum run of pixels. */
     150             :         /* --------------------------------------------------------------------
     151             :          */
     152             : 
     153     1265113 :         const int nIters = std::min(
     154     1265113 :             (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
     155             :         if (bSameDataType)
     156             :         {
     157     1264670 :             memcpy(pabyDstData, pabySrcData, nBandDataSize);
     158     1264670 :             if (nIters > 1)
     159             :             {
     160             :                 if (DATA_TYPE_SIZE == 1)
     161             :                 {
     162      326320 :                     pabySrcData += nIncSrcOffset;
     163      326320 :                     pabyDstData += nPixelSpace;
     164      326320 :                     GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
     165      326320 :                                      nPixelSpace, nIters - 1);
     166      326320 :                     pabySrcData +=
     167      326320 :                         static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
     168      326320 :                     pabyDstData +=
     169      326320 :                         static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
     170             :                 }
     171             :                 else
     172             :                 {
     173     4395716 :                     for (int i = 0; i < nIters - 1; i++)
     174             :                     {
     175     4197550 :                         pabySrcData += nIncSrcOffset;
     176     4197550 :                         pabyDstData += nPixelSpace;
     177     4197550 :                         memcpy(pabyDstData, pabySrcData, nBandDataSize);
     178             :                     }
     179             :                 }
     180      524490 :                 iSrcX += nSrcXInc * (nIters - 1);
     181      524490 :                 nOuterLoopIters -= nIters - 1;
     182             :             }
     183             :         }
     184             :         else
     185             :         {
     186             :             // Type to type conversion ...
     187         443 :             GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
     188         443 :                             eBufType, nPixelSpace, std::max(1, nIters));
     189         443 :             if (nIters > 1)
     190             :             {
     191         216 :                 pabySrcData +=
     192         216 :                     static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
     193         216 :                 pabyDstData +=
     194         216 :                     static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
     195         216 :                 iSrcX += nSrcXInc * (nIters - 1);
     196         216 :                 nOuterLoopIters -= nIters - 1;
     197             :             }
     198             :         }
     199             :     }
     200             : 
     201             :     // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
     202     1063279 :     if (nOuterLoopIters == 0)
     203             :     {
     204      367430 :         const int nRasterXSize = poBand->GetXSize();
     205      367430 :         iSrcX =
     206      734860 :             static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
     207      367430 :                                       static_cast<GInt64>(nRasterXSize - 1)));
     208      367430 :         pabyDstData += nPixelSpace;
     209      367430 :         if (iSrcX < nEndBlockX)
     210             :         {
     211      354840 :             goto no_reload_block;
     212             :         }
     213       12590 :         goto reload_block;
     214             :     }
     215      695849 :     return true;
     216             : }
     217             : 
     218             : template <class A, class B>
     219     2818770 : CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(A a, B b)
     220             : {
     221     2818770 :     return a * b;
     222             : }
     223             : 
     224             : /************************************************************************/
     225             : /*                             IRasterIO()                              */
     226             : /*                                                                      */
     227             : /*      Default internal implementation of RasterIO() ... utilizes      */
     228             : /*      the Block access methods to satisfy the request.  This would    */
     229             : /*      normally only be overridden by formats with overviews.          */
     230             : /************************************************************************/
     231             : 
     232     6180730 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
     233             :                                  int nXSize, int nYSize, void *pData,
     234             :                                  int nBufXSize, int nBufYSize,
     235             :                                  GDALDataType eBufType, GSpacing nPixelSpace,
     236             :                                  GSpacing nLineSpace,
     237             :                                  GDALRasterIOExtraArg *psExtraArg)
     238             : 
     239             : {
     240     6180730 :     if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
     241             :     {
     242           0 :         CPLError(eFlushBlockErr, CPLE_AppDefined,
     243             :                  "An error occurred while writing a dirty block "
     244             :                  "from GDALRasterBand::IRasterIO");
     245           0 :         CPLErr eErr = eFlushBlockErr;
     246           0 :         eFlushBlockErr = CE_None;
     247           0 :         return eErr;
     248             :     }
     249     6180730 :     if (nBlockXSize <= 0 || nBlockYSize <= 0)
     250             :     {
     251           0 :         CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
     252           0 :         return CE_Failure;
     253             :     }
     254             : 
     255     6180730 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
     256     6180730 :     const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
     257     6180730 :     GByte dummyBlock[2] = {0, 0};
     258     6180730 :     GByte *pabySrcBlock =
     259             :         dummyBlock; /* to avoid Coverity warning about nullptr dereference */
     260     6180730 :     GDALRasterBlock *poBlock = nullptr;
     261     6180730 :     const bool bUseIntegerRequestCoords =
     262     6545680 :         (!psExtraArg->bFloatingPointWindowValidity ||
     263      364948 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
     264      340016 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
     265             : 
     266             :     /* ==================================================================== */
     267             :     /*      A common case is the data requested with the destination        */
     268             :     /*      is packed, and the block width is the raster width.             */
     269             :     /* ==================================================================== */
     270     6088920 :     if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
     271     3234440 :         nBlockXSize == GetXSize() && nBufXSize == nXSize &&
     272    12269600 :         nBufYSize == nYSize && bUseIntegerRequestCoords)
     273             :     {
     274     3096640 :         CPLErr eErr = CE_None;
     275     3096640 :         int nLBlockY = -1;
     276             : 
     277     9751800 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     278             :         {
     279     6656240 :             const int iSrcY = iBufYOff + nYOff;
     280             : 
     281     6656240 :             if (iSrcY < nLBlockY * nBlockYSize ||
     282     6656240 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
     283             :             {
     284     3365600 :                 nLBlockY = iSrcY / nBlockYSize;
     285     3365600 :                 bool bJustInitialize =
     286      297355 :                     eRWFlag == GF_Write && nXOff == 0 &&
     287     3720870 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     288       57921 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
     289             : 
     290             :                 // Is this a partial tile at right and/or bottom edges of
     291             :                 // the raster, and that is going to be completely written?
     292             :                 // If so, do not load it from storage, but zero it so that
     293             :                 // the content outsize of the validity area is initialized.
     294     3365600 :                 bool bMemZeroBuffer = false;
     295      297355 :                 if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
     296       24978 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     297     3663040 :                     nYOff + nYSize == GetYSize() &&
     298          90 :                     nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
     299             :                 {
     300          90 :                     bJustInitialize = true;
     301          90 :                     bMemZeroBuffer = true;
     302             :                 }
     303             : 
     304     3365600 :                 if (poBlock)
     305      268957 :                     poBlock->DropLock();
     306             : 
     307     3365600 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     308     3365600 :                 poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
     309     3365600 :                 if (poBlock == nullptr)
     310             :                 {
     311        1078 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     312             :                         nullptr)
     313             :                     {
     314           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     315             :                                  "GetBlockRef failed at X block offset %d, "
     316             :                                  "Y block offset %d%s",
     317             :                                  0, nLBlockY,
     318           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     319           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     320             :                                      : "");
     321             :                     }
     322        1078 :                     eErr = CE_Failure;
     323        1078 :                     break;
     324             :                 }
     325             : 
     326     3364520 :                 if (eRWFlag == GF_Write)
     327      297355 :                     poBlock->MarkDirty();
     328             : 
     329     3364520 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     330     3364520 :                 if (bMemZeroBuffer)
     331             :                 {
     332          90 :                     memset(pabySrcBlock, 0,
     333          90 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     334          90 :                                nBlockXSize * nBlockYSize);
     335             :                 }
     336             :             }
     337             : 
     338     6655160 :             const auto nSrcByteOffset =
     339     6655160 :                 (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
     340     6655160 :                      nBlockXSize +
     341     6655160 :                  nXOff) *
     342     6655160 :                 nBandDataSize;
     343             : 
     344     6655160 :             if (eDataType == eBufType)
     345             :             {
     346     2991450 :                 if (eRWFlag == GF_Read)
     347     2518870 :                     memcpy(static_cast<GByte *>(pData) +
     348     2518870 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     349     2518870 :                            pabySrcBlock + nSrcByteOffset,
     350             :                            static_cast<size_t>(nLineSpace));
     351             :                 else
     352      472580 :                     memcpy(pabySrcBlock + nSrcByteOffset,
     353      472580 :                            static_cast<GByte *>(pData) +
     354      472580 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     355             :                            static_cast<size_t>(nLineSpace));
     356             :             }
     357             :             else
     358             :             {
     359             :                 // Type to type conversion.
     360     3663710 :                 if (eRWFlag == GF_Read)
     361     3641640 :                     GDALCopyWords64(
     362     3641640 :                         pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
     363             :                         static_cast<GByte *>(pData) +
     364     3641640 :                             static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     365             :                         eBufType, static_cast<int>(nPixelSpace), nBufXSize);
     366             :                 else
     367       22065 :                     GDALCopyWords64(static_cast<GByte *>(pData) +
     368       22065 :                                         static_cast<GPtrDiff_t>(iBufYOff) *
     369             :                                             nLineSpace,
     370             :                                     eBufType, static_cast<int>(nPixelSpace),
     371       22065 :                                     pabySrcBlock + nSrcByteOffset, eDataType,
     372             :                                     nBandDataSize, nBufXSize);
     373             :             }
     374             : 
     375     6743170 :             if (psExtraArg->pfnProgress != nullptr &&
     376       88008 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     377             :                                          psExtraArg->pProgressData))
     378             :             {
     379           5 :                 eErr = CE_Failure;
     380           5 :                 break;
     381             :             }
     382             :         }
     383             : 
     384     3096640 :         if (poBlock)
     385     3095560 :             poBlock->DropLock();
     386             : 
     387     3096640 :         return eErr;
     388             :     }
     389             : 
     390             :     /* ==================================================================== */
     391             :     /*      Do we have overviews that would be appropriate to satisfy       */
     392             :     /*      this request?                                                   */
     393             :     /* ==================================================================== */
     394     3084090 :     if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
     395             :         eRWFlag == GF_Read)
     396             :     {
     397             :         GDALRasterIOExtraArg sExtraArg;
     398        2967 :         GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
     399             : 
     400             :         const int nOverview =
     401        2967 :             GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
     402             :                                           nBufXSize, nBufYSize, &sExtraArg);
     403        2967 :         if (nOverview >= 0)
     404             :         {
     405        2892 :             GDALRasterBand *poOverviewBand = GetOverview(nOverview);
     406        2892 :             if (poOverviewBand == nullptr)
     407        2892 :                 return CE_Failure;
     408             : 
     409        2892 :             return poOverviewBand->RasterIO(
     410             :                 eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
     411        2892 :                 nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
     412             :         }
     413             :     }
     414             : 
     415      891713 :     if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
     416           6 :         nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
     417     3972910 :         nLineSpace == nPixelSpace * nBufXSize &&
     418           6 :         CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
     419             :     {
     420           0 :         memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
     421           0 :         return CE_None;
     422             :     }
     423             : 
     424             :     /* ==================================================================== */
     425             :     /*      The second case when we don't need subsample data but likely    */
     426             :     /*      need data type conversion.                                      */
     427             :     /* ==================================================================== */
     428     3081200 :     if (  // nPixelSpace == nBufDataSize &&
     429     3081200 :         nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
     430             :     {
     431             : #if DEBUG_VERBOSE
     432             :         printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
     433             :                nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
     434             : #endif
     435             : 
     436             :         /* --------------------------------------------------------------------
     437             :          */
     438             :         /*      Loop over buffer computing source locations. */
     439             :         /* --------------------------------------------------------------------
     440             :          */
     441             :         // Calculate starting values out of loop
     442     2503280 :         const int nLBlockXStart = nXOff / nBlockXSize;
     443     2503280 :         const int nXSpanEnd = nBufXSize + nXOff;
     444             : 
     445     2503280 :         int nYInc = 0;
     446     5047340 :         for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
     447     2544060 :              iBufYOff += nYInc, iSrcY += nYInc)
     448             :         {
     449     2544130 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     450             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     451     2544130 :             int nLBlockY = iSrcY / nBlockYSize;
     452     2544130 :             int nLBlockX = nLBlockXStart;
     453     2544130 :             int iSrcX = nXOff;
     454     5362830 :             while (iSrcX < nXSpanEnd)
     455             :             {
     456     2818770 :                 int nXSpan = nLBlockX * nBlockXSize;
     457     2818770 :                 if (nXSpan < INT_MAX - nBlockXSize)
     458     2818770 :                     nXSpan += nBlockXSize;
     459             :                 else
     460           0 :                     nXSpan = INT_MAX;
     461     2818770 :                 const int nXRight = nXSpan;
     462     2818770 :                 nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
     463             : 
     464             :                 const size_t nXSpanSize =
     465     2818770 :                     CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
     466             : 
     467     2818770 :                 bool bJustInitialize =
     468     2042970 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
     469       38035 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
     470     4888110 :                     nXOff <= nLBlockX * nBlockXSize &&
     471       26364 :                     nXOff + nXSize >= nXRight;
     472             : 
     473             :                 // Is this a partial tile at right and/or bottom edges of
     474             :                 // the raster, and that is going to be completely written?
     475             :                 // If so, do not load it from storage, but zero it so that
     476             :                 // the content outsize of the validity area is initialized.
     477     2818770 :                 bool bMemZeroBuffer = false;
     478     2042970 :                 if (eRWFlag == GF_Write && !bJustInitialize &&
     479     2017850 :                     nXOff <= nLBlockX * nBlockXSize &&
     480     2016190 :                     nYOff <= nLBlockY * nBlockYSize &&
     481       12145 :                     (nXOff + nXSize >= nXRight ||
     482             :                      // cppcheck-suppress knownConditionTrueFalse
     483     4864460 :                      (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
     484       11965 :                     (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
     485       10743 :                      (nYOff + nYSize == GetYSize() &&
     486        1951 :                       nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
     487             :                 {
     488        3173 :                     bJustInitialize = true;
     489        3173 :                     bMemZeroBuffer = true;
     490             :                 }
     491             : 
     492             :                 /* --------------------------------------------------------------------
     493             :                  */
     494             :                 /*      Ensure we have the appropriate block loaded. */
     495             :                 /* --------------------------------------------------------------------
     496             :                  */
     497     2818770 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     498     2818770 :                 poBlock =
     499     2818770 :                     GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     500     2818770 :                 if (!poBlock)
     501             :                 {
     502          73 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     503             :                         nullptr)
     504             :                     {
     505           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     506             :                                  "GetBlockRef failed at X block offset %d, "
     507             :                                  "Y block offset %d%s",
     508             :                                  nLBlockX, nLBlockY,
     509           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     510           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     511             :                                      : "");
     512             :                     }
     513          73 :                     return (CE_Failure);
     514             :                 }
     515             : 
     516     2818700 :                 if (eRWFlag == GF_Write)
     517     2042970 :                     poBlock->MarkDirty();
     518             : 
     519     2818700 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     520     2818700 :                 if (bMemZeroBuffer)
     521             :                 {
     522        3173 :                     memset(pabySrcBlock, 0,
     523        3173 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     524        3173 :                                nBlockXSize * nBlockYSize);
     525             :                 }
     526             :                 /* --------------------------------------------------------------------
     527             :                  */
     528             :                 /*      Copy over this chunk of data. */
     529             :                 /* --------------------------------------------------------------------
     530             :                  */
     531     2818700 :                 GPtrDiff_t iSrcOffset =
     532     2818700 :                     (static_cast<GPtrDiff_t>(iSrcX) -
     533     2818700 :                      static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
     534     2818700 :                      (static_cast<GPtrDiff_t>(iSrcY) -
     535     2818700 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     536     2818700 :                          nBlockXSize) *
     537     2818700 :                     nBandDataSize;
     538             :                 // Fill up as many rows as possible for the loaded block.
     539     5637400 :                 const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
     540     2818700 :                                           nBufYSize - iBufYOff);
     541    60991500 :                 for (int k = 0; k < kmax; k++)
     542             :                 {
     543    58172800 :                     if (eDataType == eBufType && nPixelSpace == nBufDataSize)
     544             :                     {
     545    53770900 :                         if (eRWFlag == GF_Read)
     546    49332800 :                             memcpy(static_cast<GByte *>(pData) + iBufOffset +
     547    49332800 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     548    49332800 :                                    pabySrcBlock + iSrcOffset, nXSpanSize);
     549             :                         else
     550     4438130 :                             memcpy(pabySrcBlock + iSrcOffset,
     551     4438130 :                                    static_cast<GByte *>(pData) + iBufOffset +
     552     4438130 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     553             :                                    nXSpanSize);
     554             :                     }
     555             :                     else
     556             :                     {
     557             :                         /* type to type conversion */
     558     4401910 :                         if (eRWFlag == GF_Read)
     559     4251700 :                             GDALCopyWords64(
     560     4251700 :                                 pabySrcBlock + iSrcOffset, eDataType,
     561             :                                 nBandDataSize,
     562     4251700 :                                 static_cast<GByte *>(pData) + iBufOffset +
     563     4251700 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     564             :                                 eBufType, static_cast<int>(nPixelSpace),
     565             :                                 nXSpan);
     566             :                         else
     567      150209 :                             GDALCopyWords64(
     568      150209 :                                 static_cast<GByte *>(pData) + iBufOffset +
     569      150209 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     570             :                                 eBufType, static_cast<int>(nPixelSpace),
     571      150209 :                                 pabySrcBlock + iSrcOffset, eDataType,
     572             :                                 nBandDataSize, nXSpan);
     573             :                     }
     574             : 
     575    58172800 :                     iSrcOffset +=
     576    58172800 :                         static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
     577             :                 }
     578             : 
     579             :                 iBufOffset =
     580     2818700 :                     CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
     581     2818700 :                 nLBlockX++;
     582     2818700 :                 iSrcX += nXSpan;
     583             : 
     584     2818700 :                 poBlock->DropLock();
     585     2818700 :                 poBlock = nullptr;
     586             :             }
     587             : 
     588             :             /* Compute the increment to go on a block boundary */
     589     2544060 :             nYInc = nBlockYSize - (iSrcY % nBlockYSize);
     590             : 
     591     2545940 :             if (psExtraArg->pfnProgress != nullptr &&
     592        1884 :                 !psExtraArg->pfnProgress(
     593     2545940 :                     1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
     594             :                     psExtraArg->pProgressData))
     595             :             {
     596           0 :                 return CE_Failure;
     597             :             }
     598             :         }
     599             : 
     600     2503210 :         return CE_None;
     601             :     }
     602             : 
     603             :     /* ==================================================================== */
     604             :     /*      Loop reading required source blocks to satisfy output           */
     605             :     /*      request.  This is the most general implementation.              */
     606             :     /* ==================================================================== */
     607             : 
     608      577913 :     double dfXOff = nXOff;
     609      577913 :     double dfYOff = nYOff;
     610      577913 :     double dfXSize = nXSize;
     611      577913 :     double dfYSize = nYSize;
     612      577913 :     if (psExtraArg->bFloatingPointWindowValidity)
     613             :     {
     614      242956 :         dfXOff = psExtraArg->dfXOff;
     615      242956 :         dfYOff = psExtraArg->dfYOff;
     616      242956 :         dfXSize = psExtraArg->dfXSize;
     617      242956 :         dfYSize = psExtraArg->dfYSize;
     618             :     }
     619             : 
     620             :     /* -------------------------------------------------------------------- */
     621             :     /*      Compute stepping increment.                                     */
     622             :     /* -------------------------------------------------------------------- */
     623      577913 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
     624      577913 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
     625      577913 :     CPLErr eErr = CE_None;
     626             : 
     627      577913 :     if (eRWFlag == GF_Write)
     628             :     {
     629             :         /* --------------------------------------------------------------------
     630             :          */
     631             :         /*    Write case */
     632             :         /*    Loop over raster window computing source locations in the buffer.
     633             :          */
     634             :         /* --------------------------------------------------------------------
     635             :          */
     636      166655 :         GByte *pabyDstBlock = nullptr;
     637      166655 :         int nLBlockX = -1;
     638      166655 :         int nLBlockY = -1;
     639             : 
     640     1260010 :         for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
     641             :         {
     642     1093360 :             const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
     643             : 
     644    12384200 :             for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
     645             :             {
     646    11290800 :                 const int iBufXOff =
     647    11290800 :                     static_cast<int>((iDstX - nXOff) / dfSrcXInc);
     648    11290800 :                 GPtrDiff_t iBufOffset =
     649    11290800 :                     static_cast<GPtrDiff_t>(iBufYOff) *
     650             :                         static_cast<GPtrDiff_t>(nLineSpace) +
     651    11290800 :                     iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
     652             : 
     653             :                 // FIXME: this code likely doesn't work if the dirty block gets
     654             :                 // flushed to disk before being completely written.
     655             :                 // In the meantime, bJustInitialize should probably be set to
     656             :                 // FALSE even if it is not ideal performance wise, and for
     657             :                 // lossy compression.
     658             : 
     659             :                 /* --------------------------------------------------------------------
     660             :                  */
     661             :                 /*      Ensure we have the appropriate block loaded. */
     662             :                 /* --------------------------------------------------------------------
     663             :                  */
     664    11290800 :                 if (iDstX < nLBlockX * nBlockXSize ||
     665    11041500 :                     iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
     666    10584800 :                     iDstY < nLBlockY * nBlockYSize ||
     667    10584800 :                     iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
     668             :                 {
     669      738702 :                     nLBlockX = iDstX / nBlockXSize;
     670      738702 :                     nLBlockY = iDstY / nBlockYSize;
     671             : 
     672      738702 :                     const bool bJustInitialize =
     673     1065990 :                         nYOff <= nLBlockY * nBlockYSize &&
     674      327291 :                         nYOff + nYSize - nBlockYSize >=
     675      327291 :                             nLBlockY * nBlockYSize &&
     676     1116320 :                         nXOff <= nLBlockX * nBlockXSize &&
     677       50325 :                         nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
     678             :                     /*bool bMemZeroBuffer = FALSE;
     679             :                     if( !bJustInitialize &&
     680             :                         nXOff <= nLBlockX * nBlockXSize &&
     681             :                         nYOff <= nLBlockY * nBlockYSize &&
     682             :                         (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
     683             :                          (nXOff + nXSize == GetXSize() &&
     684             :                          (nLBlockX+1) * nBlockXSize > GetXSize())) &&
     685             :                         (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
     686             :                          (nYOff + nYSize == GetYSize() &&
     687             :                          (nLBlockY+1) * nBlockYSize > GetYSize())) )
     688             :                     {
     689             :                         bJustInitialize = TRUE;
     690             :                         bMemZeroBuffer = TRUE;
     691             :                     }*/
     692      738702 :                     if (poBlock != nullptr)
     693      572047 :                         poBlock->DropLock();
     694             : 
     695      738702 :                     poBlock =
     696      738702 :                         GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     697      738702 :                     if (poBlock == nullptr)
     698             :                     {
     699           0 :                         return (CE_Failure);
     700             :                     }
     701             : 
     702      738702 :                     poBlock->MarkDirty();
     703             : 
     704      738702 :                     pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
     705             :                     /*if( bMemZeroBuffer )
     706             :                     {
     707             :                         memset(pabyDstBlock, 0,
     708             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
     709             :                     * nBlockYSize);
     710             :                     }*/
     711             :                 }
     712             : 
     713             :                 // To make Coverity happy. Should not happen by design.
     714    11290800 :                 if (pabyDstBlock == nullptr)
     715             :                 {
     716           0 :                     CPLAssert(false);
     717             :                     eErr = CE_Failure;
     718             :                     break;
     719             :                 }
     720             : 
     721             :                 /* --------------------------------------------------------------------
     722             :                  */
     723             :                 /*      Copy over this pixel of data. */
     724             :                 /* --------------------------------------------------------------------
     725             :                  */
     726    11290800 :                 GPtrDiff_t iDstOffset =
     727    11290800 :                     (static_cast<GPtrDiff_t>(iDstX) -
     728    11290800 :                      static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
     729    11290800 :                      (static_cast<GPtrDiff_t>(iDstY) -
     730    11290800 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     731    11290800 :                          nBlockXSize) *
     732    11290800 :                     nBandDataSize;
     733             : 
     734    11290800 :                 if (eDataType == eBufType)
     735             :                 {
     736    11287700 :                     memcpy(pabyDstBlock + iDstOffset,
     737    11287700 :                            static_cast<GByte *>(pData) + iBufOffset,
     738             :                            nBandDataSize);
     739             :                 }
     740             :                 else
     741             :                 {
     742             :                     /* type to type conversion ... ouch, this is expensive way
     743             :                     of handling single words */
     744        3096 :                     GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
     745        3096 :                                     eBufType, 0, pabyDstBlock + iDstOffset,
     746             :                                     eDataType, 0, 1);
     747             :                 }
     748             :             }
     749             : 
     750     1093360 :             if (psExtraArg->pfnProgress != nullptr &&
     751           0 :                 !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
     752             :                                          psExtraArg->pProgressData))
     753             :             {
     754           0 :                 eErr = CE_Failure;
     755           0 :                 break;
     756             :             }
     757             :         }
     758             :     }
     759             :     else
     760             :     {
     761      411258 :         if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
     762             :         {
     763       42075 :             if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
     764       13559 :                  psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
     765       13506 :                  psExtraArg->eResampleAlg == GRIORA_Bilinear ||
     766       28563 :                  psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
     767        3224 :                 GetColorTable() != nullptr)
     768             :             {
     769           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     770             :                          "Resampling method not supported on paletted band. "
     771             :                          "Falling back to nearest neighbour");
     772             :             }
     773       14261 :             else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
     774           3 :                      GDALDataTypeIsComplex(eDataType))
     775             :             {
     776           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     777             :                          "Resampling method not supported on complex data type "
     778             :                          "band. Falling back to nearest neighbour");
     779             :             }
     780             :             else
     781             :             {
     782       14258 :                 return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
     783             :                                          pData, nBufXSize, nBufYSize, eBufType,
     784       14258 :                                          nPixelSpace, nLineSpace, psExtraArg);
     785             :             }
     786             :         }
     787             : 
     788      397000 :         int nLimitBlockY = 0;
     789      397000 :         const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
     790      397000 :         int nStartBlockX = -nBlockXSize;
     791      397000 :         constexpr double EPS = 1e-10;
     792      397000 :         int nLBlockY = -1;
     793      397000 :         const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
     794      397000 :         const bool bIntegerXFactor =
     795      372767 :             bUseIntegerRequestCoords &&
     796      670836 :             static_cast<int>(dfSrcXInc) == dfSrcXInc &&
     797      273836 :             static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
     798             : 
     799             :         /* --------------------------------------------------------------------
     800             :          */
     801             :         /*      Read case */
     802             :         /*      Loop over buffer computing source locations. */
     803             :         /* --------------------------------------------------------------------
     804             :          */
     805     2367100 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     806             :         {
     807             :             // Add small epsilon to avoid some numeric precision issues.
     808     1970110 :             const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
     809     1970110 :             const int iSrcY = static_cast<int>(std::min(
     810     1970110 :                 std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
     811             : 
     812     1970110 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     813             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     814             : 
     815     1970110 :             if (iSrcY >= nLimitBlockY)
     816             :             {
     817      438018 :                 nLBlockY = iSrcY / nBlockYSize;
     818      438018 :                 nLimitBlockY = nLBlockY * nBlockYSize;
     819      438018 :                 if (nLimitBlockY < INT_MAX - nBlockYSize)
     820      438018 :                     nLimitBlockY += nBlockYSize;
     821             :                 else
     822           0 :                     nLimitBlockY = INT_MAX;
     823             :                 // Make sure a new block is loaded.
     824      438018 :                 nStartBlockX = -nBlockXSize;
     825             :             }
     826     1532090 :             else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
     827             :             {
     828             :                 // Make sure a new block is loaded.
     829      437363 :                 nStartBlockX = -nBlockXSize;
     830             :             }
     831             : 
     832     1970110 :             GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
     833     1970110 :                                        static_cast<GPtrDiff_t>(nBlockXSize);
     834             : 
     835     1970110 :             if (bIntegerXFactor)
     836             :             {
     837      695850 :                 int iSrcX = static_cast<int>(dfSrcXStart);
     838      695850 :                 const int nSrcXInc = static_cast<int>(dfSrcXInc);
     839      695850 :                 GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
     840      695850 :                 bool bRet = false;
     841      695850 :                 if (bByteCopy)
     842             :                 {
     843      585842 :                     bRet = DownsamplingIntegerXFactor<true, 1>(
     844             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     845             :                         static_cast<int>(nPixelSpace), nBufXSize, GDT_UInt8,
     846             :                         GDT_UInt8, nStartBlockX, nBlockXSize, poBlock,
     847             :                         nLBlockY);
     848             :                 }
     849      110008 :                 else if (eDataType == eBufType)
     850             :                 {
     851      109783 :                     switch (nBandDataSize)
     852             :                     {
     853      109630 :                         case 2:
     854      109630 :                             bRet = DownsamplingIntegerXFactor<true, 2>(
     855             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     856             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     857             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     858             :                                 nBlockXSize, poBlock, nLBlockY);
     859      109630 :                             break;
     860          55 :                         case 4:
     861          55 :                             bRet = DownsamplingIntegerXFactor<true, 4>(
     862             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     863             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     864             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     865             :                                 nBlockXSize, poBlock, nLBlockY);
     866          55 :                             break;
     867          96 :                         case 8:
     868          96 :                             bRet = DownsamplingIntegerXFactor<true, 8>(
     869             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     870             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     871             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     872             :                                 nBlockXSize, poBlock, nLBlockY);
     873          96 :                             break;
     874           2 :                         case 16:
     875           2 :                             bRet = DownsamplingIntegerXFactor<true, 16>(
     876             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     877             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     878             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     879             :                                 nBlockXSize, poBlock, nLBlockY);
     880           2 :                             break;
     881           0 :                         default:
     882           0 :                             CPLAssert(false);
     883             :                             break;
     884             :                     }
     885             :                 }
     886             :                 else
     887             :                 {
     888         225 :                     bRet = DownsamplingIntegerXFactor<false, 0>(
     889             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     890             :                         static_cast<int>(nPixelSpace), nBufXSize, eDataType,
     891             :                         eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     892             :                 }
     893      695850 :                 if (!bRet)
     894           1 :                     eErr = CE_Failure;
     895             :             }
     896             :             else
     897             :             {
     898     1274260 :                 double dfSrcX = dfSrcXStart;
     899   503811000 :                 for (int iBufXOff = 0; iBufXOff < nBufXSize;
     900   502537000 :                      iBufXOff++, dfSrcX += dfSrcXInc)
     901             :                 {
     902             :                     // TODO?: try to avoid the clamping for most iterations
     903             :                     const int iSrcX = static_cast<int>(
     904  1005070000 :                         std::min(std::max(0.0, dfSrcX),
     905   502537000 :                                  static_cast<double>(nRasterXSize - 1)));
     906             : 
     907             :                     /* --------------------------------------------------------------------
     908             :                      */
     909             :                     /*      Ensure we have the appropriate block loaded. */
     910             :                     /* --------------------------------------------------------------------
     911             :                      */
     912   502537000 :                     if (iSrcX >= nBlockXSize + nStartBlockX)
     913             :                     {
     914     1697820 :                         const int nLBlockX = iSrcX / nBlockXSize;
     915     1697820 :                         nStartBlockX = nLBlockX * nBlockXSize;
     916             : 
     917     1697820 :                         if (poBlock != nullptr)
     918     1574650 :                             poBlock->DropLock();
     919             : 
     920     1697820 :                         poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     921     1697820 :                         if (poBlock == nullptr)
     922             :                         {
     923           9 :                             eErr = CE_Failure;
     924           9 :                             break;
     925             :                         }
     926             : 
     927             :                         pabySrcBlock =
     928     1697810 :                             static_cast<GByte *>(poBlock->GetDataRef());
     929             :                     }
     930   502537000 :                     const GPtrDiff_t nDiffX =
     931   502537000 :                         static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
     932             : 
     933             :                     /* --------------------------------------------------------------------
     934             :                      */
     935             :                     /*      Copy over this pixel of data. */
     936             :                     /* --------------------------------------------------------------------
     937             :                      */
     938             : 
     939   502537000 :                     if (bByteCopy)
     940             :                     {
     941   442592000 :                         GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
     942   442592000 :                         static_cast<GByte *>(pData)[iBufOffset] =
     943   442592000 :                             pabySrcBlock[iSrcOffset];
     944             :                     }
     945    59944700 :                     else if (eDataType == eBufType)
     946             :                     {
     947    50322800 :                         GPtrDiff_t iSrcOffset =
     948    50322800 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     949    50322800 :                         memcpy(static_cast<GByte *>(pData) + iBufOffset,
     950    50322800 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
     951             :                     }
     952             :                     else
     953             :                     {
     954             :                         // Type to type conversion ...
     955     9621890 :                         GPtrDiff_t iSrcOffset =
     956     9621890 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     957     9621890 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
     958             :                                         static_cast<GByte *>(pData) +
     959     9621890 :                                             iBufOffset,
     960             :                                         eBufType, 0, 1);
     961             :                     }
     962             : 
     963   502537000 :                     iBufOffset += static_cast<int>(nPixelSpace);
     964             :                 }
     965             :             }
     966     1970110 :             if (eErr == CE_Failure)
     967          11 :                 break;
     968             : 
     969     2191530 :             if (psExtraArg->pfnProgress != nullptr &&
     970      221434 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     971             :                                          psExtraArg->pProgressData))
     972             :             {
     973           1 :                 eErr = CE_Failure;
     974           1 :                 break;
     975             :             }
     976             :         }
     977             :     }
     978             : 
     979      563655 :     if (poBlock != nullptr)
     980      563645 :         poBlock->DropLock();
     981             : 
     982      563655 :     return eErr;
     983             : }
     984             : 
     985             : /************************************************************************/
     986             : /*                      GDALRasterIOTransformer()                       */
     987             : /************************************************************************/
     988             : 
     989             : struct GDALRasterIOTransformerStruct
     990             : {
     991             :     double dfXOff;
     992             :     double dfYOff;
     993             :     double dfXRatioDstToSrc;
     994             :     double dfYRatioDstToSrc;
     995             : };
     996             : 
     997        6897 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
     998             :                                    int nPointCount, double *x, double *y,
     999             :                                    double * /* z */, int *panSuccess)
    1000             : {
    1001        6897 :     GDALRasterIOTransformerStruct *psParams =
    1002             :         static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
    1003        6897 :     if (bDstToSrc)
    1004             :     {
    1005      311993 :         for (int i = 0; i < nPointCount; i++)
    1006             :         {
    1007      305684 :             x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
    1008      305684 :             y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
    1009      305684 :             panSuccess[i] = TRUE;
    1010             :         }
    1011             :     }
    1012             :     else
    1013             :     {
    1014        1176 :         for (int i = 0; i < nPointCount; i++)
    1015             :         {
    1016         588 :             x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;
    1017         588 :             y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
    1018         588 :             panSuccess[i] = TRUE;
    1019             :         }
    1020             :     }
    1021        6897 :     return TRUE;
    1022             : }
    1023             : 
    1024             : /************************************************************************/
    1025             : /*                         RasterIOResampled()                          */
    1026             : /************************************************************************/
    1027             : 
    1028             : //! @cond Doxygen_Suppress
    1029       14258 : CPLErr GDALRasterBand::RasterIOResampled(
    1030             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1031             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1032             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    1033             : {
    1034             :     // Determine if we use warping resampling or overview resampling
    1035             :     const bool bUseWarp =
    1036       14258 :         (GDALDataTypeIsComplex(eDataType) &&
    1037       14417 :          psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    1038         159 :          psExtraArg->eResampleAlg != GRIORA_Mode);
    1039             : 
    1040       14258 :     double dfXOff = nXOff;
    1041       14258 :     double dfYOff = nYOff;
    1042       14258 :     double dfXSize = nXSize;
    1043       14258 :     double dfYSize = nYSize;
    1044       14258 :     if (psExtraArg->bFloatingPointWindowValidity)
    1045             :     {
    1046       13512 :         dfXOff = psExtraArg->dfXOff;
    1047       13512 :         dfYOff = psExtraArg->dfYOff;
    1048       13512 :         dfXSize = psExtraArg->dfXSize;
    1049       13512 :         dfYSize = psExtraArg->dfYSize;
    1050             :     }
    1051             : 
    1052       14258 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1053       14258 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1054             : 
    1055             :     // Determine the coordinates in the "virtual" output raster to see
    1056             :     // if they are integers, in which case we will use them as a shift
    1057             :     // so that subwindow extracts give the exact same results as entire raster
    1058             :     // scaling.
    1059       14258 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1060       14258 :     bool bHasXOffVirtual = false;
    1061       14258 :     int nDestXOffVirtual = 0;
    1062       14258 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1063             :     {
    1064       13930 :         bHasXOffVirtual = true;
    1065       13930 :         dfXOff = nXOff;
    1066       13930 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1067             :     }
    1068             : 
    1069       14258 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1070       14258 :     bool bHasYOffVirtual = false;
    1071       14258 :     int nDestYOffVirtual = 0;
    1072       14258 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1073             :     {
    1074       13926 :         bHasYOffVirtual = true;
    1075       13926 :         dfYOff = nYOff;
    1076       13926 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1077             :     }
    1078             : 
    1079             :     // Create a MEM dataset that wraps the output buffer.
    1080             :     GDALDataset *poMEMDS;
    1081       14258 :     void *pTempBuffer = nullptr;
    1082       14258 :     GSpacing nPSMem = nPixelSpace;
    1083       14258 :     GSpacing nLSMem = nLineSpace;
    1084       14258 :     void *pDataMem = pData;
    1085       14258 :     GDALDataType eDTMem = eBufType;
    1086       14258 :     if (eBufType != eDataType && !GDAL_GET_OPERATE_IN_BUF_TYPE(*psExtraArg))
    1087             :     {
    1088           4 :         nPSMem = GDALGetDataTypeSizeBytes(eDataType);
    1089           4 :         nLSMem = nPSMem * nBufXSize;
    1090             :         pTempBuffer =
    1091           4 :             VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
    1092           4 :         if (pTempBuffer == nullptr)
    1093           0 :             return CE_Failure;
    1094           4 :         pDataMem = pTempBuffer;
    1095           4 :         eDTMem = eDataType;
    1096             :     }
    1097             : 
    1098             :     poMEMDS =
    1099       14258 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1100             :                            nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
    1101       14258 :     GByte *pabyData = static_cast<GByte *>(pDataMem) -
    1102       14258 :                       nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
    1103       14258 :     GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
    1104             :         poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
    1105       14258 :     poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
    1106             : 
    1107       14258 :     const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1108       14258 :     const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    1109       14258 :     if (pszNBITS)
    1110           6 :         GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
    1111           6 :             "NBITS", pszNBITS, "IMAGE_STRUCTURE");
    1112             : 
    1113       14258 :     CPLErr eErr = CE_None;
    1114             : 
    1115             :     // Do the resampling.
    1116       14258 :     if (bUseWarp)
    1117             :     {
    1118         149 :         int bHasNoData = FALSE;
    1119         149 :         double dfNoDataValue = GetNoDataValue(&bHasNoData);
    1120             : 
    1121         149 :         VRTDatasetH hVRTDS = nullptr;
    1122         149 :         GDALRasterBandH hVRTBand = nullptr;
    1123         149 :         if (GetDataset() == nullptr)
    1124             :         {
    1125             :             /* Create VRT dataset that wraps the whole band */
    1126           0 :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1127           0 :             VRTAddBand(hVRTDS, eDataType, nullptr);
    1128           0 :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1129           0 :             VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
    1130             :                                0, 0, nRasterXSize, nRasterYSize, nullptr,
    1131             :                                VRT_NODATA_UNSET);
    1132             : 
    1133             :             /* Add a mask band if needed */
    1134           0 :             if (GetMaskFlags() != GMF_ALL_VALID)
    1135             :             {
    1136           0 :                 GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
    1137             :                 VRTSourcedRasterBand *poVRTMaskBand =
    1138             :                     reinterpret_cast<VRTSourcedRasterBand *>(
    1139             :                         reinterpret_cast<GDALRasterBand *>(hVRTBand)
    1140           0 :                             ->GetMaskBand());
    1141           0 :                 poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
    1142           0 :                                                  nRasterYSize, 0, 0,
    1143           0 :                                                  nRasterXSize, nRasterYSize);
    1144             :             }
    1145             :         }
    1146             : 
    1147         149 :         GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
    1148         149 :         switch (psExtraArg->eResampleAlg)
    1149             :         {
    1150           0 :             case GRIORA_NearestNeighbour:
    1151           0 :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1152           0 :                 break;
    1153         147 :             case GRIORA_Bilinear:
    1154         147 :                 psWarpOptions->eResampleAlg = GRA_Bilinear;
    1155         147 :                 break;
    1156           0 :             case GRIORA_Cubic:
    1157           0 :                 psWarpOptions->eResampleAlg = GRA_Cubic;
    1158           0 :                 break;
    1159           0 :             case GRIORA_CubicSpline:
    1160           0 :                 psWarpOptions->eResampleAlg = GRA_CubicSpline;
    1161           0 :                 break;
    1162           0 :             case GRIORA_Lanczos:
    1163           0 :                 psWarpOptions->eResampleAlg = GRA_Lanczos;
    1164           0 :                 break;
    1165           0 :             case GRIORA_Average:
    1166           0 :                 psWarpOptions->eResampleAlg = GRA_Average;
    1167           0 :                 break;
    1168           2 :             case GRIORA_RMS:
    1169           2 :                 psWarpOptions->eResampleAlg = GRA_RMS;
    1170           2 :                 break;
    1171           0 :             case GRIORA_Mode:
    1172           0 :                 psWarpOptions->eResampleAlg = GRA_Mode;
    1173           0 :                 break;
    1174           0 :             default:
    1175           0 :                 CPLAssert(false);
    1176             :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1177             :                 break;
    1178             :         }
    1179         149 :         psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
    1180         149 :         psWarpOptions->hDstDS = poMEMDS;
    1181         149 :         psWarpOptions->nBandCount = 1;
    1182         149 :         int nSrcBandNumber = hVRTDS ? 1 : nBand;
    1183         149 :         int nDstBandNumber = 1;
    1184         149 :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1185         149 :         psWarpOptions->panDstBands = &nDstBandNumber;
    1186         298 :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress
    1187         149 :                                          ? psExtraArg->pfnProgress
    1188             :                                          : GDALDummyProgress;
    1189         149 :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1190         149 :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1191         149 :         if (bHasNoData)
    1192             :         {
    1193           0 :             psWarpOptions->papszWarpOptions = CSLSetNameValue(
    1194             :                 psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
    1195           0 :             if (psWarpOptions->padfSrcNoDataReal == nullptr)
    1196             :             {
    1197           0 :                 psWarpOptions->padfSrcNoDataReal =
    1198           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1199           0 :                 psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
    1200             :             }
    1201             : 
    1202           0 :             if (psWarpOptions->padfDstNoDataReal == nullptr)
    1203             :             {
    1204           0 :                 psWarpOptions->padfDstNoDataReal =
    1205           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1206           0 :                 psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
    1207             :             }
    1208             :         }
    1209             : 
    1210             :         GDALRasterIOTransformerStruct sTransformer;
    1211         149 :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1212         149 :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1213         149 :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1214         149 :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1215         149 :         psWarpOptions->pTransformerArg = &sTransformer;
    1216             : 
    1217             :         GDALWarpOperationH hWarpOperation =
    1218         149 :             GDALCreateWarpOperation(psWarpOptions);
    1219         149 :         eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
    1220             :                                      nDestYOffVirtual, nBufXSize, nBufYSize);
    1221         149 :         GDALDestroyWarpOperation(hWarpOperation);
    1222             : 
    1223         149 :         psWarpOptions->panSrcBands = nullptr;
    1224         149 :         psWarpOptions->panDstBands = nullptr;
    1225         149 :         GDALDestroyWarpOptions(psWarpOptions);
    1226             : 
    1227         149 :         if (hVRTDS)
    1228           0 :             GDALClose(hVRTDS);
    1229             :     }
    1230             :     else
    1231             :     {
    1232             :         const char *pszResampling =
    1233       14109 :             GDALRasterIOGetResampleAlg(psExtraArg->eResampleAlg);
    1234       14109 :         int nKernelRadius = 0;
    1235             :         GDALResampleFunction pfnResampleFunc =
    1236       14109 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1237       14109 :         CPLAssert(pfnResampleFunc);
    1238             :         GDALDataType eWrkDataType =
    1239       14109 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1240       14109 :         int nHasNoData = 0;
    1241       14109 :         double dfNoDataValue = GetNoDataValue(&nHasNoData);
    1242       14109 :         const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    1243       14109 :         if (!bHasNoData)
    1244       13977 :             dfNoDataValue = 0.0;
    1245             : 
    1246       14109 :         int nDstBlockXSize = nBufXSize;
    1247       14109 :         int nDstBlockYSize = nBufYSize;
    1248       14109 :         int nFullResXChunk = 0;
    1249       14109 :         int nFullResYChunk = 0;
    1250             :         while (true)
    1251             :         {
    1252       14120 :             nFullResXChunk =
    1253       14120 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1254       14120 :             nFullResYChunk =
    1255       14120 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1256       14120 :             if (nFullResXChunk > nRasterXSize)
    1257        4777 :                 nFullResXChunk = nRasterXSize;
    1258       14120 :             if (nFullResYChunk > nRasterYSize)
    1259         594 :                 nFullResYChunk = nRasterYSize;
    1260       14120 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1261       14062 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1262             :                  1024 * 1024))
    1263             :                 break;
    1264             :             // When operating on the full width of a raster whose block width is
    1265             :             // the raster width, prefer doing chunks in height.
    1266          11 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1267             :                 nDstBlockYSize > 1)
    1268           0 :                 nDstBlockYSize /= 2;
    1269             :             /* Otherwise cut the maximal dimension */
    1270          11 :             else if (nDstBlockXSize > 1 &&
    1271           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1272          11 :                 nDstBlockXSize /= 2;
    1273             :             else
    1274           0 :                 nDstBlockYSize /= 2;
    1275             :         }
    1276             : 
    1277       14109 :         int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
    1278       14109 :         int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    1279       14109 :         if (nOvrXFactor == 0)
    1280        2029 :             nOvrXFactor = 1;
    1281       14109 :         if (nOvrYFactor == 0)
    1282        2028 :             nOvrYFactor = 1;
    1283       14109 :         int nFullResXSizeQueried =
    1284       14109 :             nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
    1285       14109 :         int nFullResYSizeQueried =
    1286       14109 :             nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
    1287             : 
    1288       14109 :         if (nFullResXSizeQueried > nRasterXSize)
    1289        2734 :             nFullResXSizeQueried = nRasterXSize;
    1290       14109 :         if (nFullResYSizeQueried > nRasterYSize)
    1291         332 :             nFullResYSizeQueried = nRasterYSize;
    1292             : 
    1293             :         void *pChunk =
    1294       14109 :             VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    1295             :                                 nFullResXSizeQueried, nFullResYSizeQueried);
    1296       14109 :         GByte *pabyChunkNoDataMask = nullptr;
    1297             : 
    1298       14109 :         GDALRasterBand *poMaskBand = GetMaskBand();
    1299       14109 :         int l_nMaskFlags = GetMaskFlags();
    1300             : 
    1301       14109 :         bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
    1302       14109 :         if (bUseNoDataMask)
    1303             :         {
    1304        7525 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1305             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1306             :         }
    1307       14109 :         if (pChunk == nullptr ||
    1308        7525 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1309             :         {
    1310           0 :             GDALClose(poMEMDS);
    1311           0 :             CPLFree(pChunk);
    1312           0 :             CPLFree(pabyChunkNoDataMask);
    1313           0 :             VSIFree(pTempBuffer);
    1314           0 :             return CE_Failure;
    1315             :         }
    1316             : 
    1317       14109 :         const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
    1318       14109 :                                  DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
    1319       14109 :         int nBlocksDone = 0;
    1320             : 
    1321             :         int nDstYOff;
    1322       28218 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1323       14109 :              nDstYOff += nDstBlockYSize)
    1324             :         {
    1325             :             int nDstYCount;
    1326       14109 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1327       14109 :                 nDstYCount = nDstBlockYSize;
    1328             :             else
    1329           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1330             : 
    1331       14109 :             int nChunkYOff =
    1332       14109 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1333       14109 :             int nChunkYOff2 = nYOff + 1 +
    1334       14109 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1335             :                                                     dfYRatioDstToSrc));
    1336       14109 :             if (nChunkYOff2 > nRasterYSize)
    1337         782 :                 nChunkYOff2 = nRasterYSize;
    1338       14109 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1339       14109 :             CPLAssert(nYCount <= nFullResYChunk);
    1340             : 
    1341       14109 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
    1342       14109 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
    1343       14109 :             if (nChunkYOffQueried < 0)
    1344             :             {
    1345         491 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1346         491 :                 nChunkYOffQueried = 0;
    1347             :             }
    1348       14109 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1349         594 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1350       14109 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1351             : 
    1352       14109 :             int nDstXOff = 0;
    1353       28218 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1354       14109 :                  nDstXOff += nDstBlockXSize)
    1355             :             {
    1356       14109 :                 int nDstXCount = 0;
    1357       14109 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1358       14109 :                     nDstXCount = nDstBlockXSize;
    1359             :                 else
    1360           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1361             : 
    1362       14109 :                 int nChunkXOff =
    1363       14109 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1364       14109 :                 int nChunkXOff2 =
    1365       14109 :                     nXOff + 1 +
    1366       14109 :                     static_cast<int>(
    1367       14109 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1368       14109 :                 if (nChunkXOff2 > nRasterXSize)
    1369        8802 :                     nChunkXOff2 = nRasterXSize;
    1370       14109 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1371       14109 :                 CPLAssert(nXCount <= nFullResXChunk);
    1372             : 
    1373       14109 :                 int nChunkXOffQueried =
    1374       14109 :                     nChunkXOff - nKernelRadius * nOvrXFactor;
    1375       14109 :                 int nChunkXSizeQueried =
    1376       14109 :                     nXCount + 2 * nKernelRadius * nOvrXFactor;
    1377       14109 :                 if (nChunkXOffQueried < 0)
    1378             :                 {
    1379        2795 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1380        2795 :                     nChunkXOffQueried = 0;
    1381             :                 }
    1382       14109 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1383        2781 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1384       14109 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1385             : 
    1386             :                 // Read the source buffers.
    1387       14109 :                 eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1388             :                                 nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1389             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    1390             :                                 eWrkDataType, 0, 0, nullptr);
    1391             : 
    1392       14109 :                 bool bSkipResample = false;
    1393       14109 :                 bool bNoDataMaskFullyOpaque = false;
    1394       14109 :                 if (eErr == CE_None && bUseNoDataMask)
    1395             :                 {
    1396        7525 :                     eErr = poMaskBand->RasterIO(
    1397             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1398             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1399             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1400             :                         nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
    1401             : 
    1402             :                     /* Optimizations if mask is fully opaque or transparent */
    1403        7525 :                     int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1404        7525 :                     GByte bVal = pabyChunkNoDataMask[0];
    1405        7525 :                     int i = 1;
    1406    15237000 :                     for (; i < nPixels; i++)
    1407             :                     {
    1408    15230700 :                         if (pabyChunkNoDataMask[i] != bVal)
    1409        1168 :                             break;
    1410             :                     }
    1411        7525 :                     if (i == nPixels)
    1412             :                     {
    1413        6357 :                         if (bVal == 0)
    1414             :                         {
    1415       12094 :                             for (int j = 0; j < nDstYCount; j++)
    1416             :                             {
    1417        6377 :                                 GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
    1418             :                                                 static_cast<GByte *>(pDataMem) +
    1419        6377 :                                                     nLSMem * (j + nDstYOff) +
    1420        6377 :                                                     nDstXOff * nPSMem,
    1421             :                                                 eDTMem,
    1422             :                                                 static_cast<int>(nPSMem),
    1423             :                                                 nDstXCount);
    1424             :                             }
    1425        5717 :                             bSkipResample = true;
    1426             :                         }
    1427             :                         else
    1428             :                         {
    1429         640 :                             bNoDataMaskFullyOpaque = true;
    1430             :                         }
    1431             :                     }
    1432             :                 }
    1433             : 
    1434       14109 :                 if (!bSkipResample && eErr == CE_None)
    1435             :                 {
    1436        8389 :                     const bool bPropagateNoData = false;
    1437        8389 :                     void *pDstBuffer = nullptr;
    1438        8389 :                     GDALDataType eDstBufferDataType = GDT_Unknown;
    1439             :                     GDALRasterBand *poMEMBand =
    1440        8389 :                         GDALRasterBand::FromHandle(hMEMBand);
    1441        8389 :                     GDALOverviewResampleArgs args;
    1442        8389 :                     args.eSrcDataType = eDataType;
    1443        8389 :                     args.eOvrDataType = poMEMBand->GetRasterDataType();
    1444        8389 :                     args.nOvrXSize = poMEMBand->GetXSize();
    1445        8389 :                     args.nOvrYSize = poMEMBand->GetYSize();
    1446        8389 :                     args.nOvrNBITS = nNBITS;
    1447        8389 :                     args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1448        8389 :                     args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1449        8389 :                     args.dfSrcXDelta =
    1450        8389 :                         dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1451        8389 :                     args.dfSrcYDelta =
    1452        8389 :                         dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1453        8389 :                     args.eWrkDataType = eWrkDataType;
    1454        8389 :                     args.pabyChunkNodataMask =
    1455        8389 :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
    1456        8389 :                     args.nChunkXOff =
    1457        8389 :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1458        8389 :                     args.nChunkXSize = nChunkXSizeQueried;
    1459        8389 :                     args.nChunkYOff =
    1460        8389 :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1461        8389 :                     args.nChunkYSize = nChunkYSizeQueried;
    1462        8389 :                     args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1463        8389 :                     args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
    1464        8389 :                     args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1465        8389 :                     args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
    1466        8389 :                     args.pszResampling = pszResampling;
    1467        8389 :                     args.bHasNoData = bHasNoData;
    1468        8389 :                     args.dfNoDataValue = dfNoDataValue;
    1469        8389 :                     args.poColorTable = GetColorTable();
    1470        8389 :                     args.bPropagateNoData = bPropagateNoData;
    1471        8389 :                     eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
    1472             :                                            &eDstBufferDataType);
    1473        8389 :                     if (eErr == CE_None)
    1474             :                     {
    1475        8389 :                         eErr = poMEMBand->RasterIO(
    1476             :                             GF_Write, nDstXOff + nDestXOffVirtual,
    1477             :                             nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
    1478             :                             pDstBuffer, nDstXCount, nDstYCount,
    1479             :                             eDstBufferDataType, 0, 0, nullptr);
    1480             :                     }
    1481        8389 :                     CPLFree(pDstBuffer);
    1482             :                 }
    1483             : 
    1484       14109 :                 nBlocksDone++;
    1485       25031 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1486       10922 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    1487             :                                              "", psExtraArg->pProgressData))
    1488             :                 {
    1489           1 :                     eErr = CE_Failure;
    1490             :                 }
    1491             :             }
    1492             :         }
    1493             : 
    1494       14109 :         CPLFree(pChunk);
    1495       14109 :         CPLFree(pabyChunkNoDataMask);
    1496             :     }
    1497             : 
    1498       14258 :     if (pTempBuffer)
    1499             :     {
    1500           4 :         CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
    1501             :             GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
    1502             :             pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    1503             :             nullptr));
    1504             :     }
    1505       14258 :     GDALClose(poMEMDS);
    1506       14258 :     VSIFree(pTempBuffer);
    1507             : 
    1508       14258 :     return eErr;
    1509             : }
    1510             : 
    1511             : /************************************************************************/
    1512             : /*                         RasterIOResampled()                          */
    1513             : /************************************************************************/
    1514             : 
    1515         892 : CPLErr GDALDataset::RasterIOResampled(
    1516             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1517             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1518             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    1519             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    1520             : 
    1521             : {
    1522             : #if 0
    1523             :     // Determine if we use warping resampling or overview resampling
    1524             :     bool bUseWarp = false;
    1525             :     if( GDALDataTypeIsComplex( eDataType ) )
    1526             :         bUseWarp = true;
    1527             : #endif
    1528             : 
    1529         892 :     double dfXOff = nXOff;
    1530         892 :     double dfYOff = nYOff;
    1531         892 :     double dfXSize = nXSize;
    1532         892 :     double dfYSize = nYSize;
    1533         892 :     if (psExtraArg->bFloatingPointWindowValidity)
    1534             :     {
    1535         765 :         dfXOff = psExtraArg->dfXOff;
    1536         765 :         dfYOff = psExtraArg->dfYOff;
    1537         765 :         dfXSize = psExtraArg->dfXSize;
    1538         765 :         dfYSize = psExtraArg->dfYSize;
    1539             :     }
    1540             : 
    1541         892 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1542         892 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1543             : 
    1544             :     // Determine the coordinates in the "virtual" output raster to see
    1545             :     // if they are integers, in which case we will use them as a shift
    1546             :     // so that subwindow extracts give the exact same results as entire raster
    1547             :     // scaling.
    1548         892 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1549         892 :     bool bHasXOffVirtual = false;
    1550         892 :     int nDestXOffVirtual = 0;
    1551         892 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1552             :     {
    1553         767 :         bHasXOffVirtual = true;
    1554         767 :         dfXOff = nXOff;
    1555         767 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1556             :     }
    1557             : 
    1558         892 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1559         892 :     bool bHasYOffVirtual = false;
    1560         892 :     int nDestYOffVirtual = 0;
    1561         892 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1562             :     {
    1563         727 :         bHasYOffVirtual = true;
    1564         727 :         dfYOff = nYOff;
    1565         727 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1566             :     }
    1567             : 
    1568             :     // Create a MEM dataset that wraps the output buffer.
    1569         892 :     std::unique_ptr<void, VSIFreeReleaser> pTempBuffer;
    1570         892 :     GSpacing nPSMem = nPixelSpace;
    1571         892 :     GSpacing nLSMem = nLineSpace;
    1572         892 :     GSpacing nBandSpaceMEM = nBandSpace;
    1573         892 :     void *pDataMem = pData;
    1574         892 :     GDALDataType eDTMem = eBufType;
    1575         892 :     GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
    1576         892 :     const GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
    1577         892 :     if (eBufType != eDataType && !GDAL_GET_OPERATE_IN_BUF_TYPE(*psExtraArg))
    1578             :     {
    1579           2 :         nPSMem = GDALGetDataTypeSizeBytes(eDataType);
    1580           2 :         nLSMem = nPSMem * nBufXSize;
    1581           2 :         nBandSpaceMEM = nLSMem * nBandCount;
    1582           2 :         pTempBuffer.reset(VSI_MALLOC3_VERBOSE(nBandCount, nBufYSize,
    1583             :                                               static_cast<size_t>(nLSMem)));
    1584           2 :         if (pTempBuffer == nullptr)
    1585           0 :             return CE_Failure;
    1586           2 :         pDataMem = pTempBuffer.get();
    1587           2 :         eDTMem = eDataType;
    1588             :     }
    1589             : 
    1590             :     auto poMEMDS = std::unique_ptr<GDALDataset>(
    1591         892 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1592        1784 :                            nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr));
    1593             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1594             :     std::vector<GDALRasterBand *> apoDstBands(nBandCount);
    1595             : #endif
    1596         892 :     int nNBITS = 0;
    1597        2896 :     for (int i = 0; i < nBandCount; i++)
    1598             :     {
    1599        2004 :         GByte *const pBandData = static_cast<GByte *>(pDataMem) -
    1600        2004 :                                  nPSMem * nDestXOffVirtual -
    1601        2004 :                                  nLSMem * nDestYOffVirtual + nBandSpaceMEM * i;
    1602        2004 :         auto poMEMBand = GDALRasterBand::FromHandle(MEMCreateRasterBandEx(
    1603             :             poMEMDS.get(), i + 1, pBandData, eDTMem, nPSMem, nLSMem, false));
    1604        2004 :         poMEMDS->SetBand(i + 1, poMEMBand);
    1605             : 
    1606        2004 :         GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
    1607             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1608             :         apoDstBands[i] = poMEMBand;
    1609             : #endif
    1610             :         const char *pszNBITS =
    1611        2004 :             poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1612        2004 :         if (pszNBITS)
    1613             :         {
    1614           0 :             nNBITS = atoi(pszNBITS);
    1615           0 :             poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
    1616           0 :                                                            "IMAGE_STRUCTURE");
    1617             :         }
    1618             :     }
    1619             : 
    1620         892 :     CPLErr eErr = CE_None;
    1621             : 
    1622             :     // TODO(schwehr): Why disabled?  Why not just delete?
    1623             :     // Looks like this code was initially added as disabled by copying
    1624             :     // from RasterIO here:
    1625             :     // https://trac.osgeo.org/gdal/changeset/29572
    1626             : #if 0
    1627             :     // Do the resampling.
    1628             :     if( bUseWarp )
    1629             :     {
    1630             :         VRTDatasetH hVRTDS = nullptr;
    1631             :         GDALRasterBandH hVRTBand = nullptr;
    1632             :         if( GetDataset() == nullptr )
    1633             :         {
    1634             :             /* Create VRT dataset that wraps the whole dataset */
    1635             :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1636             :             VRTAddBand( hVRTDS, eDataType, nullptr );
    1637             :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1638             :             VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
    1639             :                                 (GDALRasterBandH)this,
    1640             :                                 0, 0,
    1641             :                                 nRasterXSize, nRasterYSize,
    1642             :                                 0, 0,
    1643             :                                 nRasterXSize, nRasterYSize,
    1644             :                                 nullptr, VRT_NODATA_UNSET );
    1645             : 
    1646             :             /* Add a mask band if needed */
    1647             :             if( GetMaskFlags() != GMF_ALL_VALID )
    1648             :             {
    1649             :                 ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
    1650             :                 VRTSourcedRasterBand* poVRTMaskBand =
    1651             :                     (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
    1652             :                 poVRTMaskBand->
    1653             :                     AddMaskBandSource( this,
    1654             :                                     0, 0,
    1655             :                                     nRasterXSize, nRasterYSize,
    1656             :                                     0, 0,
    1657             :                                     nRasterXSize, nRasterYSize);
    1658             :             }
    1659             :         }
    1660             : 
    1661             :         GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
    1662             :         psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
    1663             :         psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
    1664             :         psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
    1665             :         psWarpOptions->nBandCount = 1;
    1666             :         int nSrcBandNumber = (hVRTDS ? 1 : nBand);
    1667             :         int nDstBandNumber = 1;
    1668             :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1669             :         psWarpOptions->panDstBands = &nDstBandNumber;
    1670             :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
    1671             :                     psExtraArg->pfnProgress : GDALDummyProgress;
    1672             :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1673             :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1674             :         GDALRasterIOTransformerStruct sTransformer;
    1675             :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1676             :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1677             :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1678             :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1679             :         psWarpOptions->pTransformerArg = &sTransformer;
    1680             : 
    1681             :         GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
    1682             :         eErr = GDALChunkAndWarpImage( hWarpOperation,
    1683             :                                       nDestXOffVirtual, nDestYOffVirtual,
    1684             :                                       nBufXSize, nBufYSize );
    1685             :         GDALDestroyWarpOperation( hWarpOperation );
    1686             : 
    1687             :         psWarpOptions->panSrcBands = nullptr;
    1688             :         psWarpOptions->panDstBands = nullptr;
    1689             :         GDALDestroyWarpOptions( psWarpOptions );
    1690             : 
    1691             :         if( hVRTDS )
    1692             :             GDALClose(hVRTDS);
    1693             :     }
    1694             :     else
    1695             : #endif
    1696             :     {
    1697             :         const char *pszResampling =
    1698         892 :             GDALRasterIOGetResampleAlg(psExtraArg->eResampleAlg);
    1699             : 
    1700             :         int nBlockXSize, nBlockYSize;
    1701         892 :         poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    1702             : 
    1703             :         int nKernelRadius;
    1704             :         GDALResampleFunction pfnResampleFunc =
    1705         892 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1706         892 :         CPLAssert(pfnResampleFunc);
    1707             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1708             :         GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
    1709             :             GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
    1710             : #endif
    1711             :         GDALDataType eWrkDataType =
    1712         892 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1713             : 
    1714         892 :         int nDstBlockXSize = nBufXSize;
    1715         892 :         int nDstBlockYSize = nBufYSize;
    1716             :         int nFullResXChunk, nFullResYChunk;
    1717             :         while (true)
    1718             :         {
    1719         892 :             nFullResXChunk =
    1720         892 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1721         892 :             nFullResYChunk =
    1722         892 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1723         892 :             if (nFullResXChunk > nRasterXSize)
    1724         591 :                 nFullResXChunk = nRasterXSize;
    1725         892 :             if (nFullResYChunk > nRasterYSize)
    1726          57 :                 nFullResYChunk = nRasterYSize;
    1727         892 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1728         890 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1729             :                  1024 * 1024))
    1730             :                 break;
    1731             :             // When operating on the full width of a raster whose block width is
    1732             :             // the raster width, prefer doing chunks in height.
    1733           0 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1734             :                 nDstBlockYSize > 1)
    1735           0 :                 nDstBlockYSize /= 2;
    1736             :             /* Otherwise cut the maximal dimension */
    1737           0 :             else if (nDstBlockXSize > 1 &&
    1738           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1739           0 :                 nDstBlockXSize /= 2;
    1740             :             else
    1741           0 :                 nDstBlockYSize /= 2;
    1742             :         }
    1743             : 
    1744        1784 :         int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    1745         892 :                                   static_cast<int>(0.5 + dfYRatioDstToSrc));
    1746         892 :         if (nOvrFactor == 0)
    1747         104 :             nOvrFactor = 1;
    1748         892 :         int nFullResXSizeQueried =
    1749         892 :             nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    1750         892 :         int nFullResYSizeQueried =
    1751         892 :             nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
    1752             : 
    1753         892 :         if (nFullResXSizeQueried > nRasterXSize)
    1754         616 :             nFullResXSizeQueried = nRasterXSize;
    1755         892 :         if (nFullResYSizeQueried > nRasterYSize)
    1756          60 :             nFullResYSizeQueried = nRasterYSize;
    1757             : 
    1758         892 :         void *pChunk = VSI_MALLOC3_VERBOSE(
    1759             :             cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
    1760             :                               nBandCount),
    1761             :             nFullResXSizeQueried, nFullResYSizeQueried);
    1762         892 :         GByte *pabyChunkNoDataMask = nullptr;
    1763             : 
    1764         892 :         GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
    1765         892 :         int nMaskFlags = poFirstSrcBand->GetMaskFlags();
    1766             : 
    1767         892 :         bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
    1768         892 :         if (bUseNoDataMask)
    1769             :         {
    1770         617 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1771             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1772             :         }
    1773         892 :         if (pChunk == nullptr ||
    1774         617 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1775             :         {
    1776           0 :             CPLFree(pChunk);
    1777           0 :             CPLFree(pabyChunkNoDataMask);
    1778           0 :             return CE_Failure;
    1779             :         }
    1780             : 
    1781         892 :         const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
    1782         892 :                                  DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
    1783         892 :         int nBlocksDone = 0;
    1784             : 
    1785             :         int nDstYOff;
    1786        1784 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1787         892 :              nDstYOff += nDstBlockYSize)
    1788             :         {
    1789             :             int nDstYCount;
    1790         892 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1791         892 :                 nDstYCount = nDstBlockYSize;
    1792             :             else
    1793           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1794             : 
    1795         892 :             int nChunkYOff =
    1796         892 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1797         892 :             int nChunkYOff2 = nYOff + 1 +
    1798         892 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1799             :                                                     dfYRatioDstToSrc));
    1800         892 :             if (nChunkYOff2 > nRasterYSize)
    1801         139 :                 nChunkYOff2 = nRasterYSize;
    1802         892 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1803         892 :             CPLAssert(nYCount <= nFullResYChunk);
    1804             : 
    1805         892 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    1806         892 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
    1807         892 :             if (nChunkYOffQueried < 0)
    1808             :             {
    1809         142 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1810         142 :                 nChunkYOffQueried = 0;
    1811             :             }
    1812         892 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1813         157 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1814         892 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1815             : 
    1816             :             int nDstXOff;
    1817        1784 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1818         892 :                  nDstXOff += nDstBlockXSize)
    1819             :             {
    1820             :                 int nDstXCount;
    1821         892 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1822         892 :                     nDstXCount = nDstBlockXSize;
    1823             :                 else
    1824           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1825             : 
    1826         892 :                 int nChunkXOff =
    1827         892 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1828         892 :                 int nChunkXOff2 =
    1829         892 :                     nXOff + 1 +
    1830         892 :                     static_cast<int>(
    1831         892 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1832         892 :                 if (nChunkXOff2 > nRasterXSize)
    1833         647 :                     nChunkXOff2 = nRasterXSize;
    1834         892 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1835         892 :                 CPLAssert(nXCount <= nFullResXChunk);
    1836             : 
    1837         892 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    1838         892 :                 int nChunkXSizeQueried =
    1839         892 :                     nXCount + 2 * nKernelRadius * nOvrFactor;
    1840         892 :                 if (nChunkXOffQueried < 0)
    1841             :                 {
    1842         647 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1843         647 :                     nChunkXOffQueried = 0;
    1844             :                 }
    1845         892 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1846         655 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1847         892 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1848             : 
    1849         892 :                 bool bSkipResample = false;
    1850         892 :                 bool bNoDataMaskFullyOpaque = false;
    1851         892 :                 if (eErr == CE_None && bUseNoDataMask)
    1852             :                 {
    1853         617 :                     eErr = poMaskBand->RasterIO(
    1854             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1855             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1856             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1857             :                         nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
    1858             : 
    1859             :                     /* Optimizations if mask is fully opaque or transparent */
    1860         617 :                     const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1861         617 :                     const GByte bVal = pabyChunkNoDataMask[0];
    1862         617 :                     int i = 1;  // Used after for.
    1863    48197000 :                     for (; i < nPixels; i++)
    1864             :                     {
    1865    48196500 :                         if (pabyChunkNoDataMask[i] != bVal)
    1866          72 :                             break;
    1867             :                     }
    1868         617 :                     if (i == nPixels)
    1869             :                     {
    1870         545 :                         if (bVal == 0)
    1871             :                         {
    1872         373 :                             GByte abyZero[16] = {0};
    1873         780 :                             for (int iBand = 0; iBand < nBandCount; iBand++)
    1874             :                             {
    1875        3499 :                                 for (int j = 0; j < nDstYCount; j++)
    1876             :                                 {
    1877        3092 :                                     GDALCopyWords64(
    1878             :                                         abyZero, GDT_UInt8, 0,
    1879             :                                         static_cast<GByte *>(pDataMem) +
    1880        3092 :                                             iBand * nBandSpaceMEM +
    1881        3092 :                                             nLSMem * (j + nDstYOff) +
    1882        3092 :                                             nDstXOff * nPSMem,
    1883             :                                         eBufType, static_cast<int>(nPSMem),
    1884             :                                         nDstXCount);
    1885             :                                 }
    1886             :                             }
    1887         373 :                             bSkipResample = true;
    1888             :                         }
    1889             :                         else
    1890             :                         {
    1891         172 :                             bNoDataMaskFullyOpaque = true;
    1892             :                         }
    1893             :                     }
    1894             :                 }
    1895             : 
    1896         892 :                 if (!bSkipResample && eErr == CE_None)
    1897             :                 {
    1898             :                     /* Read the source buffers */
    1899         516 :                     eErr = RasterIO(
    1900             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1901             :                         nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1902             :                         nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
    1903             :                         nBandCount, panBandMap, 0, 0, 0, nullptr);
    1904             :                 }
    1905             : 
    1906             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1907             :                 if (pfnResampleFuncMultiBands && !bSkipResample &&
    1908             :                     eErr == CE_None)
    1909             :                 {
    1910             :                     eErr = pfnResampleFuncMultiBands(
    1911             :                         dfXRatioDstToSrc, dfYRatioDstToSrc,
    1912             :                         dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
    1913             :                         dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
    1914             :                         eWrkDataType, (GByte *)pChunk, nBandCount,
    1915             :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
    1916             :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
    1917             :                         nChunkXSizeQueried,
    1918             :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
    1919             :                         nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
    1920             :                         nDstXOff + nDestXOffVirtual + nDstXCount,
    1921             :                         nDstYOff + nDestYOffVirtual,
    1922             :                         nDstYOff + nDestYOffVirtual + nDstYCount,
    1923             :                         apoDstBands.data(), pszResampling, FALSE /*bHasNoData*/,
    1924             :                         0.0 /* dfNoDataValue */, nullptr /* color table*/,
    1925             :                         eDataType);
    1926             :                 }
    1927             :                 else
    1928             : #endif
    1929             :                 {
    1930             :                     size_t nChunkBandOffset =
    1931         892 :                         static_cast<size_t>(nChunkXSizeQueried) *
    1932         892 :                         nChunkYSizeQueried *
    1933         892 :                         GDALGetDataTypeSizeBytes(eWrkDataType);
    1934        2480 :                     for (int i = 0;
    1935        2480 :                          i < nBandCount && !bSkipResample && eErr == CE_None;
    1936             :                          i++)
    1937             :                     {
    1938        1588 :                         const bool bPropagateNoData = false;
    1939        1588 :                         void *pDstBuffer = nullptr;
    1940        1588 :                         GDALDataType eDstBufferDataType = GDT_Unknown;
    1941             :                         GDALRasterBand *poMEMBand =
    1942        1588 :                             poMEMDS->GetRasterBand(i + 1);
    1943        1588 :                         GDALOverviewResampleArgs args;
    1944        1588 :                         args.eSrcDataType = eDataType;
    1945        1588 :                         args.eOvrDataType = poMEMBand->GetRasterDataType();
    1946        1588 :                         args.nOvrXSize = poMEMBand->GetXSize();
    1947        1588 :                         args.nOvrYSize = poMEMBand->GetYSize();
    1948        1588 :                         args.nOvrNBITS = nNBITS;
    1949        1588 :                         args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1950        1588 :                         args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1951        1588 :                         args.dfSrcXDelta =
    1952        1588 :                             dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1953        1588 :                         args.dfSrcYDelta =
    1954        1588 :                             dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1955        1588 :                         args.eWrkDataType = eWrkDataType;
    1956        1588 :                         args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
    1957        1588 :                                                        ? nullptr
    1958             :                                                        : pabyChunkNoDataMask;
    1959        1588 :                         args.nChunkXOff =
    1960        1588 :                             nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1961        1588 :                         args.nChunkXSize = nChunkXSizeQueried;
    1962        1588 :                         args.nChunkYOff =
    1963        1588 :                             nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1964        1588 :                         args.nChunkYSize = nChunkYSizeQueried;
    1965        1588 :                         args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1966        1588 :                         args.nDstXOff2 =
    1967        1588 :                             nDstXOff + nDestXOffVirtual + nDstXCount;
    1968        1588 :                         args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1969        1588 :                         args.nDstYOff2 =
    1970        1588 :                             nDstYOff + nDestYOffVirtual + nDstYCount;
    1971        1588 :                         args.pszResampling = pszResampling;
    1972        1588 :                         args.bHasNoData = false;
    1973        1588 :                         args.dfNoDataValue = 0.0;
    1974        1588 :                         args.poColorTable = nullptr;
    1975        1588 :                         args.bPropagateNoData = bPropagateNoData;
    1976             : 
    1977             :                         eErr =
    1978        3176 :                             pfnResampleFunc(args,
    1979        1588 :                                             reinterpret_cast<GByte *>(pChunk) +
    1980        1588 :                                                 i * nChunkBandOffset,
    1981             :                                             &pDstBuffer, &eDstBufferDataType);
    1982        1588 :                         if (eErr == CE_None)
    1983             :                         {
    1984        1588 :                             eErr = poMEMBand->RasterIO(
    1985             :                                 GF_Write, nDstXOff + nDestXOffVirtual,
    1986             :                                 nDstYOff + nDestYOffVirtual, nDstXCount,
    1987             :                                 nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
    1988             :                                 eDstBufferDataType, 0, 0, nullptr);
    1989             :                         }
    1990        1588 :                         CPLFree(pDstBuffer);
    1991             :                     }
    1992             :                 }
    1993             : 
    1994         892 :                 nBlocksDone++;
    1995        1281 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1996         389 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    1997             :                                              "", psExtraArg->pProgressData))
    1998             :                 {
    1999           0 :                     eErr = CE_Failure;
    2000             :                 }
    2001             :             }
    2002             :         }
    2003             : 
    2004         892 :         CPLFree(pChunk);
    2005         892 :         CPLFree(pabyChunkNoDataMask);
    2006             :     }
    2007             : 
    2008         892 :     if (pTempBuffer)
    2009             :     {
    2010           2 :         CPL_IGNORE_RET_VAL(poMEMDS->RasterIO(
    2011             :             GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
    2012             :             pData, nBufXSize, nBufYSize, eBufType, nBandCount, nullptr,
    2013             :             nPixelSpace, nLineSpace, nBandSpace, nullptr));
    2014             :     }
    2015             : 
    2016         892 :     return eErr;
    2017             : }
    2018             : 
    2019             : //! @endcond
    2020             : 
    2021             : /************************************************************************/
    2022             : /*                           GDALSwapWords()                            */
    2023             : /************************************************************************/
    2024             : 
    2025             : /**
    2026             :  * Byte swap words in-place.
    2027             :  *
    2028             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2029             :  * a memory array.  No assumption is made that the words being swapped are
    2030             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2031             :  * to determine if the current platform is big endian or little endian.  Use
    2032             :  * the macros like CPL_SWAP32() to byte swap single values without the overhead
    2033             :  * of a function call.
    2034             :  *
    2035             :  * @param pData pointer to start of data buffer; checked when nWordCount > 0.
    2036             :  * @param nWordSize size of words in bytes: 2, 4 or 8 are swapped, 1 is a no-op.
    2037             :  * @param nWordCount the number of words to be swapped in this call.
    2038             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2039             :  * the next. For packed buffers this is the same as nWordSize.
    2040             :  */
    2041             : 
    2042      497149 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
    2043             :                                int nWordSkip)
    2044             : 
    2045             : {
    2046      497149 :     if (nWordCount > 0)
    2047      497149 :         VALIDATE_POINTER0(pData, "GDALSwapWords");
    2048             :     // Byte-typed cursor: it is advanced by nWordSkip bytes after each word.
    2049      497149 :     GByte *pabyData = static_cast<GByte *>(pData);
    2050             : 
    2051      497149 :     switch (nWordSize)
    2052             :     {
    2053        7234 :         case 1:
    2054        7234 :             break;  // single-byte words: nothing to swap
    2055             :         // 2-byte words: CPL_SWAP16PTR is used whatever the alignment.
    2056      476905 :         case 2:
    2057      476905 :             CPLAssert(nWordSkip >= 2 || nWordCount == 1);
    2058   228062000 :             for (int i = 0; i < nWordCount; i++)
    2059             :             {
    2060   227585000 :                 CPL_SWAP16PTR(pabyData);
    2061   227585000 :                 pabyData += nWordSkip;
    2062             :             }
    2063      476905 :             break;
    2064             :         // 4-byte words: two paths, chosen on the alignment of address and stride.
    2065       10584 :         case 4:
    2066       10584 :             CPLAssert(nWordSkip >= 4 || nWordCount == 1);
    2067       10584 :             if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
    2068             :             {  // start and stride are 4-byte aligned: swap via GUInt32 access
    2069    29140600 :                 for (int i = 0; i < nWordCount; i++)
    2070             :                 {
    2071    29130000 :                     *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
    2072             :                         *reinterpret_cast<const GUInt32 *>(pabyData));
    2073    29130000 :                     pabyData += nWordSkip;
    2074       10581 :                 }
    2075             :             }
    2076             :             else
    2077             :             {  // otherwise swap through the pointer-based macro
    2078           9 :                 for (int i = 0; i < nWordCount; i++)
    2079             :                 {
    2080           6 :                     CPL_SWAP32PTR(pabyData);
    2081           6 :                     pabyData += nWordSkip;
    2082             :                 }
    2083             :             }
    2084       10584 :             break;
    2085             :         // 8-byte words: same two-path scheme as for 4-byte words.
    2086        2426 :         case 8:
    2087        2426 :             CPLAssert(nWordSkip >= 8 || nWordCount == 1);
    2088        2426 :             if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
    2089             :             {  // start and stride are 8-byte aligned: swap via GUInt64 access
    2090     3356900 :                 for (int i = 0; i < nWordCount; i++)
    2091             :                 {
    2092     3354480 :                     *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
    2093             :                         *reinterpret_cast<const GUInt64 *>(pabyData));
    2094     3354480 :                     pabyData += nWordSkip;
    2095        2425 :                 }
    2096             :             }
    2097             :             else
    2098             :             {  // otherwise swap through the pointer-based macro
    2099           3 :                 for (int i = 0; i < nWordCount; i++)
    2100             :                 {
    2101           2 :                     CPL_SWAP64PTR(pabyData);
    2102           2 :                     pabyData += nWordSkip;
    2103             :                 }
    2104             :             }
    2105        2426 :             break;
    2106             :         // Any other word size is not handled and trips the assertion below.
    2107           0 :         default:
    2108           0 :             CPLAssert(false);
    2109             :     }
    2110             : }
    2111             : 
    2112             : /************************************************************************/
    2113             : /*                          GDALSwapWordsEx()                           */
    2114             : /************************************************************************/
    2115             : 
    2116             : /**
    2117             :  * Byte swap words in-place, with a size_t word count.
    2118             :  *
    2119             :  * This function performs the same swap as GDALSwapWords(), to which it
    2120             :  * delegates, but accepts a size_t nWordCount.  The buffer is processed in
    2121             :  * chunks of at most 2^30 words so that each chunk's count fits in the int
    2122             :  * parameter of GDALSwapWords().  Use the CPL_LSB and CPL_MSB macros from
    2123             :  * cpl_port.h to determine if the current platform is big endian or little
    2124             :  * endian.
    2125             :  *
    2126             :  * @param pData pointer to start of data buffer.
    2127             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2128             :  * @param nWordCount the number of words to be swapped in this call.
    2129             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2130             :  * the next. For packed buffers this is the same as nWordSize.
    2131             :  */
    2132        6130 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
    2133             :                                  int nWordSkip)
    2134             : {
    2135        6130 :     GByte *pabyData = static_cast<GByte *>(pData);
    2136       12260 :     while (nWordCount)
    2137             :     {  // each iteration swaps one chunk, then advances past it
    2138             :         // Cap each chunk at 2^30 words (a multiple of 8) so it fits in an int.
    2139        6130 :         const int nWordCountSmall =
    2140        6130 :             (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
    2141        6130 :         GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
    2142        6130 :         pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
    2143        6130 :         nWordCount -= nWordCountSmall;
    2144             :     }
    2145        6130 : }
    2146             : 
    2147             : // Place the new GDALCopyWords helpers in an anonymous namespace
    2148             : namespace
    2149             : {
    2150             : 
    2151             : /************************************************************************/
    2152             : /*                           GDALCopyWordsT()                           */
    2153             : /************************************************************************/
    2154             : /**
    2155             :  * Template function, used to copy data from pSrcData into buffer
    2156             :  * pDstData, with stride nSrcPixelStride in the source data and
    2157             :  * stride nDstPixelStride in the destination data. This template can
    2158             :  * deal with the case where the input data type is real or complex and
    2159             :  * the output is real.
    2160             :  *
    2161             :  * @param pSrcData the source data buffer
    2162             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2163             :  *                      of interest.
    2164             :  * @param pDstData the destination buffer.
    2165             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2166             :  *                      interest.
    2167             :  * @param nWordCount the total number of pixel words to copy
    2168             :  *
    2169             :  * @code
    2170             :  * // Assume an input buffer of type GUInt16 named pBufferIn
    2171             :  * GByte *pBufferOut = new GByte[numBytesOut];
    2172             :  * GDALCopyWordsT<GUInt16, GByte>(pBufferIn, 2, pBufferOut, 1, numBytesOut);
    2173             :  * @endcode
    2174             :  * @note
    2175             :  * This is a private function, and should not be exposed outside of
    2176             :  * rasterio.cpp. External users should call the GDALCopyWords driver function.
    2177             :  */
    2178             : // Generic path: copies nWordCount words one at a time through GDALCopyWord().
    2179             : template <class Tin, class Tout>
    2180    49013857 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
    2181             :                                          int nSrcPixelStride,  // in bytes
    2182             :                                          Tout *const CPL_RESTRICT pDstData,
    2183             :                                          int nDstPixelStride,  // in bytes
    2184             :                                          GPtrDiff_t nWordCount)
    2185             : {
    2186    49013857 :     decltype(nWordCount) nDstOffset = 0;
    2187             :     // char-typed views of both buffers, since the strides are byte offsets.
    2188    49013857 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2189    49013857 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2190   356655113 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2191             :     {
    2192   307641208 :         const Tin tValue =
    2193   307641208 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2194   307641208 :         Tout *const pOutPixel =
    2195   307641208 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2196             :         // The Tin -> Tout conversion itself is delegated to GDALCopyWord().
    2197   307641208 :         GDALCopyWord(tValue, *pOutPixel);
    2198             :         // Source is addressed as n * stride; destination by a running offset.
    2199   307641208 :         nDstOffset += nDstPixelStride;
    2200             :     }
    2201    49013857 : }
    2202             : // Primary template (CPL_NOINLINE): forwards as-is to GDALCopyWordsGenericT().
    2203             : template <class Tin, class Tout>
    2204    29776660 : static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
    2205             :                                         int nSrcPixelStride,
    2206             :                                         Tout *const CPL_RESTRICT pDstData,
    2207             :                                         int nDstPixelStride,
    2208             :                                         GPtrDiff_t nWordCount)
    2209             : {
    2210    29776660 :     GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
    2211             :                           nWordCount);
    2212    29776660 : }
    2213             : // Copies 8 words per step with GDALCopy8Words() when both buffers are packed.
    2214             : template <class Tin, class Tout>
    2215     5080935 : static void inline GDALCopyWordsT_8atatime(
    2216             :     const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
    2217             :     Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
    2218             :     GPtrDiff_t nWordCount)
    2219             : {
    2220     5080935 :     decltype(nWordCount) nDstOffset = 0;
    2221             :     // char-typed views of both buffers, since the strides are byte offsets.
    2222     5080935 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2223     5080935 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2224     5080935 :     decltype(nWordCount) n = 0;
    2225     5080935 :     if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
    2226             :         nDstPixelStride == static_cast<int>(sizeof(Tout)))
    2227             :     {  // packed source and destination: take groups of 8 contiguous words
    2228    52932327 :         for (; n < nWordCount - 7; n += 8)
    2229             :         {  // runs while at least 8 words remain
    2230    52390796 :             const Tin *pInValues = reinterpret_cast<const Tin *>(
    2231    52390796 :                 pSrcDataPtr + (n * nSrcPixelStride));
    2232    52390796 :             Tout *const pOutPixels =
    2233    52390796 :                 reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2234             :             // Conversion of the 8 words is delegated to GDALCopy8Words().
    2235    52390796 :             GDALCopy8Words(pInValues, pOutPixels);
    2236             : 
    2237    52390796 :             nDstOffset += 8 * nDstPixelStride;
    2238             :         }
    2239             :     }
    2240    10465987 :     for (; n < nWordCount; n++)
    2241             :     {  // leftover words, or every word when a stride is not the packed one
    2242     5385052 :         const Tin tValue =
    2243     5385052 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2244     5385052 :         Tout *const pOutPixel =
    2245     5385052 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2246             : 
    2247     5385052 :         GDALCopyWord(tValue, *pOutPixel);
    2248             : 
    2249     5385052 :         nDstOffset += nDstPixelStride;
    2250             :     }
    2251     5080935 : }
    2252             : 
    2253             : #ifdef HAVE_SSE2
    2254             : // SSE2 copy of GByte words to a 16-bit integer type (Tout), 16 words per step.
    2255             : template <class Tout>
    2256     1042126 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
    2257             :                               int nSrcPixelStride,
    2258             :                               Tout *const CPL_RESTRICT pDstData,
    2259             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2260             : {
    2261             :     static_assert(std::is_integral<Tout>::value &&
    2262             :                       sizeof(Tout) == sizeof(uint16_t),
    2263             :                   "Bad Tout");
    2264     1042126 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2265             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2266             :     {  // packed source and destination: vectorized path
    2267       35752 :         decltype(nWordCount) n = 0;
    2268       35752 :         const __m128i xmm_zero = _mm_setzero_si128();
    2269       35752 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2270             :             reinterpret_cast<GByte *>(pDstData);
    2271     1478148 :         for (; n < nWordCount - 15; n += 16)
    2272             :         {  // interleaving with zero bytes widens each byte to 16 bits
    2273     1442396 :             __m128i xmm = _mm_loadu_si128(
    2274     1442396 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2275     1442396 :             __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
    2276     1442396 :             __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
    2277             :             _mm_storeu_si128(
    2278     1442396 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
    2279             :             _mm_storeu_si128(
    2280     1442396 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
    2281             :         }
    2282      111662 :         for (; n < nWordCount; n++)
    2283             :         {  // fewer than 16 words left: plain element-wise copy
    2284       75910 :             pDstData[n] = pSrcData[n];
    2285       35752 :         }
    2286             :     }
    2287             :     else
    2288             :     {  // non-packed strides: defer to the generic word-by-word copy
    2289     1006371 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2290             :                               nDstPixelStride, nWordCount);
    2291             :     }
    2292     1042126 : }
    2293             : // GByte -> GUInt16 specialization: routed to GDALCopyWordsByteTo16Bit().
    2294             : template <>
    2295     1029400 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2296             :                                  int nSrcPixelStride,
    2297             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    2298             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2299             : {
    2300     1029400 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2301             :                              nDstPixelStride, nWordCount);
    2302     1029400 : }
    2303             : // GByte -> GInt16 specialization: routed to GDALCopyWordsByteTo16Bit().
    2304             : template <>
    2305       12726 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2306             :                                  int nSrcPixelStride,
    2307             :                                  GInt16 *const CPL_RESTRICT pDstData,
    2308             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2309             : {
    2310       12726 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2311             :                              nDstPixelStride, nWordCount);
    2312       12726 : }
    2313             : // SSE2 copy of GByte words to a 32-bit integer type (Tout), 16 words per step.
    2314             : template <class Tout>
    2315    16237076 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
    2316             :                               int nSrcPixelStride,
    2317             :                               Tout *const CPL_RESTRICT pDstData,
    2318             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2319             : {
    2320             :     static_assert(std::is_integral<Tout>::value &&
    2321             :                       sizeof(Tout) == sizeof(uint32_t),
    2322             :                   "Bad Tout");
    2323    16237076 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2324             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2325             :     {  // packed source and destination: vectorized path
    2326     6532586 :         decltype(nWordCount) n = 0;
    2327     6532586 :         const __m128i xmm_zero = _mm_setzero_si128();
    2328     6532586 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2329             :             reinterpret_cast<GByte *>(pDstData);
    2330    74248027 :         for (; n < nWordCount - 15; n += 16)
    2331             :         {  // widen 8 -> 16 -> 32 bits by interleaving with zeros, twice
    2332    67715361 :             __m128i xmm = _mm_loadu_si128(
    2333    67715361 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2334    67715361 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2335    67715361 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2336    67715361 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2337    67715361 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2338    67715361 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2339    67715361 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2340             :             _mm_storeu_si128(
    2341    67715361 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
    2342             :             _mm_storeu_si128(
    2343    67715361 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
    2344             :             _mm_storeu_si128(
    2345    67715361 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
    2346             :             _mm_storeu_si128(
    2347    67715361 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
    2348             :         }
    2349    14825816 :         for (; n < nWordCount; n++)
    2350             :         {  // fewer than 16 words left: plain element-wise copy
    2351     8293240 :             pDstData[n] = pSrcData[n];
    2352     6532586 :         }
    2353             :     }
    2354             :     else
    2355             :     {  // non-packed strides: defer to the generic word-by-word copy
    2356     9704510 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2357             :                               nDstPixelStride, nWordCount);
    2358             :     }
    2359    16237076 : }
    2360             : // GByte -> GUInt32 specialization: routed to GDALCopyWordsByteTo32Bit().
    2361             : template <>
    2362         476 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2363             :                                  int nSrcPixelStride,
    2364             :                                  GUInt32 *const CPL_RESTRICT pDstData,
    2365             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2366             : {
    2367         476 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2368             :                              nDstPixelStride, nWordCount);
    2369         476 : }
    2370             : // GByte -> GInt32 specialization: routed to GDALCopyWordsByteTo32Bit().
    2371             : template <>
    2372    16236600 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2373             :                                  int nSrcPixelStride,
    2374             :                                  GInt32 *const CPL_RESTRICT pDstData,
    2375             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2376             : {
    2377    16236600 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2378             :                              nDstPixelStride, nWordCount);
    2379    16236600 : }
    2380             : // GByte -> float specialization: SSE2 path for packed buffers, 16 words per step.
    2381             : template <>
    2382     2851070 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2383             :                                  int nSrcPixelStride,
    2384             :                                  float *const CPL_RESTRICT pDstData,
    2385             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2386             : {
    2387     2851070 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2388             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2389             :     {  // packed source and destination: vectorized path
    2390      228189 :         decltype(nWordCount) n = 0;
    2391      228189 :         const __m128i xmm_zero = _mm_setzero_si128();
    2392      228189 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2393             :             reinterpret_cast<GByte *>(pDstData);
    2394     2267160 :         for (; n < nWordCount - 15; n += 16)
    2395             :         {  // widen bytes to 32-bit integers, then convert 4 at a time to float
    2396     2038970 :             __m128i xmm = _mm_loadu_si128(
    2397     2038970 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2398     2038970 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2399     2038970 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2400     2038970 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2401     2038970 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2402     2038970 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2403     2038970 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2404     2038970 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2405     2038970 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2406     2038970 :             __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
    2407     2038970 :             __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
    2408     2038970 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2409             :                           xmm0_f);
    2410             :             _mm_storeu_ps(
    2411     2038970 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2412             :             _mm_storeu_ps(
    2413     2038970 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
    2414             :             _mm_storeu_ps(
    2415     2038970 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
    2416             :         }
    2417      951437 :         for (; n < nWordCount; n++)
    2418             :         {  // fewer than 16 words left: plain element-wise copy
    2419      723248 :             pDstData[n] = pSrcData[n];
    2420      228189 :         }
    2421             :     }
    2422             :     else
    2423             :     {  // non-packed strides: defer to the generic word-by-word copy
    2424     2622880 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2425             :                               nDstPixelStride, nWordCount);
    2426             :     }
    2427     2851070 : }
    2428             : 
    2429             : template <>
    2430      170938 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2431             :                                  int nSrcPixelStride,
    2432             :                                  double *const CPL_RESTRICT pDstData,
    2433             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2434             : {
    2435      170938 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2436             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2437             :     {
    2438      147140 :         decltype(nWordCount) n = 0;
    2439      147140 :         const __m128i xmm_zero = _mm_setzero_si128();
    2440      147140 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2441             :             reinterpret_cast<GByte *>(pDstData);
    2442     3127410 :         for (; n < nWordCount - 15; n += 16)
    2443             :         {
    2444     2980270 :             __m128i xmm = _mm_loadu_si128(
    2445     2980270 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2446     2980270 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2447     2980270 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2448     2980270 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2449     2980270 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2450     2980270 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2451     2980270 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2452             : 
    2453             : #if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
    2454             :             _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2455             :                              _mm256_cvtepi32_pd(xmm0));
    2456             :             _mm256_storeu_pd(
    2457             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2458             :                 _mm256_cvtepi32_pd(xmm1));
    2459             :             _mm256_storeu_pd(
    2460             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2461             :                 _mm256_cvtepi32_pd(xmm2));
    2462             :             _mm256_storeu_pd(
    2463             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2464             :                 _mm256_cvtepi32_pd(xmm3));
    2465             : #else
    2466     2980270 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2467     2980270 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2468     2980270 :             __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
    2469     2980270 :             __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
    2470     2980270 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2471     2980270 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2472     2980270 :             xmm2 = _mm_srli_si128(xmm2, 8);
    2473     2980270 :             xmm3 = _mm_srli_si128(xmm3, 8);
    2474     2980270 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2475     2980270 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2476     2980270 :             __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
    2477     2980270 :             __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
    2478             : 
    2479     2980270 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2480             :                           xmm0_low_d);
    2481             :             _mm_storeu_pd(
    2482     2980270 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2483             :                 xmm0_high_d);
    2484             :             _mm_storeu_pd(
    2485     2980270 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2486             :                 xmm1_low_d);
    2487             :             _mm_storeu_pd(
    2488     2980270 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2489             :                 xmm1_high_d);
    2490             :             _mm_storeu_pd(
    2491     2980270 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2492             :                 xmm2_low_d);
    2493             :             _mm_storeu_pd(
    2494     2980270 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
    2495             :                 xmm2_high_d);
    2496             :             _mm_storeu_pd(
    2497     2980270 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2498             :                 xmm3_low_d);
    2499             :             _mm_storeu_pd(
    2500     2980270 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
    2501             :                 xmm3_high_d);
    2502             : #endif
    2503             :         }
    2504      280823 :         for (; n < nWordCount; n++)
    2505             :         {
    2506      133683 :             pDstData[n] = pSrcData[n];
    2507      147140 :         }
    2508             :     }
    2509             :     else
    2510             :     {
    2511       23798 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2512             :                               nDstPixelStride, nWordCount);
    2513             :     }
    2514      170938 : }
    2515             : 
    2516             : template <>
    2517         148 : CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
    2518             :                                  int nSrcPixelStride,
    2519             :                                  int8_t *const CPL_RESTRICT pDstData,
    2520             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2521             : {
    2522         148 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2523             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2524             :     {
    2525         142 :         decltype(nWordCount) n = 0;
    2526         142 :         const __m128i xmm_127 = _mm_set1_epi8(127);
    2527         146 :         for (; n < nWordCount - 31; n += 32)
    2528             :         {
    2529           8 :             __m128i xmm0 = _mm_loadu_si128(
    2530           4 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2531           4 :             __m128i xmm1 = _mm_loadu_si128(
    2532           4 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 16));
    2533           4 :             xmm0 = _mm_min_epu8(xmm0, xmm_127);
    2534           4 :             xmm1 = _mm_min_epu8(xmm1, xmm_127);
    2535           4 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2536           4 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
    2537             :                              xmm1);
    2538             :         }
    2539        2424 :         for (; n < nWordCount; n++)
    2540             :         {
    2541        2282 :             pDstData[n] =
    2542        2282 :                 pSrcData[n] >= 127 ? 127 : static_cast<int8_t>(pSrcData[n]);
    2543         142 :         }
    2544             :     }
    2545             :     else
    2546             :     {
    2547           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2548             :                               nDstPixelStride, nWordCount);
    2549             :     }
    2550         148 : }
    2551             : 
    2552             : template <>
    2553          62 : CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
    2554             :                                  int nSrcPixelStride,
    2555             :                                  uint8_t *const CPL_RESTRICT pDstData,
    2556             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2557             : {
    2558          62 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2559             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2560             :     {
    2561          56 :         decltype(nWordCount) n = 0;
    2562             : #if !(defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS))
    2563          56 :         const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
    2564             : #endif
    2565         117 :         for (; n < nWordCount - 31; n += 32)
    2566             :         {
    2567         122 :             __m128i xmm0 = _mm_loadu_si128(
    2568          61 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2569          61 :             __m128i xmm1 = _mm_loadu_si128(
    2570          61 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 16));
    2571             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2572             :             xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
    2573             :             xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
    2574             : #else
    2575          61 :             xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
    2576          61 :             xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
    2577          61 :             xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
    2578          61 :             xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
    2579          61 :             xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
    2580          61 :             xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
    2581             : #endif
    2582          61 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2583          61 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
    2584             :                              xmm1);
    2585             :         }
    2586         352 :         for (; n < nWordCount; n++)
    2587             :         {
    2588         296 :             pDstData[n] =
    2589         296 :                 pSrcData[n] < 0 ? 0 : static_cast<uint8_t>(pSrcData[n]);
    2590          56 :         }
    2591             :     }
    2592             :     else
    2593             :     {
    2594           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2595             :                               nDstPixelStride, nWordCount);
    2596             :     }
    2597          62 : }
    2598             : 
    2599             : template <>
    2600        6037 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2601             :                                  int nSrcPixelStride,
    2602             :                                  uint8_t *const CPL_RESTRICT pDstData,
    2603             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2604             : {
    2605        6037 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2606             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2607             :     {
    2608        5062 :         decltype(nWordCount) n = 0;
    2609             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2610             :         const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
    2611             : #else
    2612             :         // In SSE2, min_epu16 does not exist, so shift from
    2613             :         // UInt16 to SInt16 to be able to use min_epi16
    2614        5062 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2615        5062 :         const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
    2616             : #endif
    2617       71888 :         for (; n < nWordCount - 15; n += 16)
    2618             :         {
    2619      133652 :             __m128i xmm0 = _mm_loadu_si128(
    2620       66826 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2621       66826 :             __m128i xmm1 = _mm_loadu_si128(
    2622       66826 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2623             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2624             :             xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
    2625             :             xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
    2626             : #else
    2627       66826 :             xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
    2628       66826 :             xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
    2629       66826 :             xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
    2630       66826 :             xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
    2631       66826 :             xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
    2632       66826 :             xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
    2633             : #endif
    2634       66826 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    2635       66826 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2636             :         }
    2637       16403 :         for (; n < nWordCount; n++)
    2638             :         {
    2639       11341 :             pDstData[n] =
    2640       11341 :                 pSrcData[n] >= 255 ? 255 : static_cast<uint8_t>(pSrcData[n]);
    2641        5062 :         }
    2642             :     }
    2643             :     else
    2644             :     {
    2645         975 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2646             :                               nDstPixelStride, nWordCount);
    2647             :     }
    2648        6037 : }
    2649             : 
    2650             : template <>
    2651          46 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2652             :                                  int nSrcPixelStride,
    2653             :                                  int16_t *const CPL_RESTRICT pDstData,
    2654             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2655             : {
    2656          46 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2657             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2658             :     {
    2659          40 :         decltype(nWordCount) n = 0;
    2660             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2661             :         const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
    2662             : #else
    2663             :         // In SSE2, min_epu16 does not exist, so shift from
    2664             :         // UInt16 to SInt16 to be able to use min_epi16
    2665          40 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2666          40 :         const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
    2667             : #endif
    2668         169 :         for (; n < nWordCount - 15; n += 16)
    2669             :         {
    2670         258 :             __m128i xmm0 = _mm_loadu_si128(
    2671         129 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2672         129 :             __m128i xmm1 = _mm_loadu_si128(
    2673         129 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2674             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2675             :             xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
    2676             :             xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
    2677             : #else
    2678         129 :             xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
    2679         129 :             xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
    2680         129 :             xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
    2681         129 :             xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
    2682         129 :             xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
    2683         129 :             xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
    2684             : #endif
    2685         129 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2686         129 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
    2687             :                              xmm1);
    2688             :         }
    2689         191 :         for (; n < nWordCount; n++)
    2690             :         {
    2691         282 :             pDstData[n] = pSrcData[n] >= 32767
    2692             :                               ? 32767
    2693         131 :                               : static_cast<int16_t>(pSrcData[n]);
    2694          40 :         }
    2695             :     }
    2696             :     else
    2697             :     {
    2698           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2699             :                               nDstPixelStride, nWordCount);
    2700             :     }
    2701          46 : }
    2702             : 
    2703             : template <>
    2704         136 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2705             :                                  int nSrcPixelStride,
    2706             :                                  uint16_t *const CPL_RESTRICT pDstData,
    2707             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2708             : {
    2709         136 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2710             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2711             :     {
    2712          93 :         decltype(nWordCount) n = 0;
    2713          93 :         const __m128i xmm_zero = _mm_setzero_si128();
    2714         278 :         for (; n < nWordCount - 15; n += 16)
    2715             :         {
    2716         370 :             __m128i xmm0 = _mm_loadu_si128(
    2717         185 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2718         185 :             __m128i xmm1 = _mm_loadu_si128(
    2719         185 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2720         185 :             xmm0 = _mm_max_epi16(xmm0, xmm_zero);
    2721         185 :             xmm1 = _mm_max_epi16(xmm1, xmm_zero);
    2722         185 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2723         185 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
    2724             :                              xmm1);
    2725             :         }
    2726         471 :         for (; n < nWordCount; n++)
    2727             :         {
    2728         378 :             pDstData[n] =
    2729         378 :                 pSrcData[n] < 0 ? 0 : static_cast<uint16_t>(pSrcData[n]);
    2730          93 :         }
    2731             :     }
    2732             :     else
    2733             :     {
    2734          43 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2735             :                               nDstPixelStride, nWordCount);
    2736             :     }
    2737         136 : }
    2738             : 
    2739             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2740             : 
    2741             : template <>
    2742             : CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
    2743             :                                  int nSrcPixelStride,
    2744             :                                  int32_t *const CPL_RESTRICT pDstData,
    2745             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2746             : {
    2747             :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2748             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2749             :     {
    2750             :         decltype(nWordCount) n = 0;
    2751             :         const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
    2752             :         for (; n < nWordCount - 8; n += 7)
    2753             :         {
    2754             :             __m128i xmm0 = _mm_loadu_si128(
    2755             :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2756             :             __m128i xmm1 = _mm_loadu_si128(
    2757             :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    2758             :             xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
    2759             :             xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
    2760             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2761             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
    2762             :                              xmm1);
    2763             :         }
    2764             :         for (; n < nWordCount; n++)
    2765             :         {
    2766             :             pDstData[n] = pSrcData[n] >= INT_MAX
    2767             :                               ? INT_MAX
    2768             :                               : static_cast<int32_t>(pSrcData[n]);
    2769             :         }
    2770             :     }
    2771             :     else
    2772             :     {
    2773             :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2774             :                               nDstPixelStride, nWordCount);
    2775             :     }
    2776             : }
    2777             : 
    2778             : template <>
    2779             : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    2780             :                                  int nSrcPixelStride,
    2781             :                                  uint32_t *const CPL_RESTRICT pDstData,
    2782             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2783             : {
    2784             :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2785             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2786             :     {
    2787             :         decltype(nWordCount) n = 0;
    2788             :         const __m128i xmm_zero = _mm_setzero_si128();
    2789             :         for (; n < nWordCount - 7; n += 8)
    2790             :         {
    2791             :             __m128i xmm0 = _mm_loadu_si128(
    2792             :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2793             :             __m128i xmm1 = _mm_loadu_si128(
    2794             :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    2795             :             xmm0 = _mm_max_epi32(xmm0, xmm_zero);
    2796             :             xmm1 = _mm_max_epi32(xmm1, xmm_zero);
    2797             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2798             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
    2799             :                              xmm1);
    2800             :         }
    2801             :         for (; n < nWordCount; n++)
    2802             :         {
    2803             :             pDstData[n] =
    2804             :                 pSrcData[n] < 0 ? 0 : static_cast<uint32_t>(pSrcData[n]);
    2805             :         }
    2806             :     }
    2807             :     else
    2808             :     {
    2809             :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2810             :                               nDstPixelStride, nWordCount);
    2811             :     }
    2812             : }
    2813             : 
    2814             : #endif  // defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2815             : 
    2816             : template <>
    2817         403 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2818             :                                  int nSrcPixelStride,
    2819             :                                  float *const CPL_RESTRICT pDstData,
    2820             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2821             : {
    2822         403 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2823             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2824             :     {
    2825         397 :         decltype(nWordCount) n = 0;
    2826         397 :         const __m128i xmm_zero = _mm_setzero_si128();
    2827         397 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2828             :             reinterpret_cast<GByte *>(pDstData);
    2829        1688 :         for (; n < nWordCount - 7; n += 8)
    2830             :         {
    2831        1291 :             __m128i xmm = _mm_loadu_si128(
    2832        1291 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2833        1291 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2834        1291 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2835        1291 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2836        1291 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2837        1291 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2838             :                           xmm0_f);
    2839             :             _mm_storeu_ps(
    2840        1291 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2841             :         }
    2842        1415 :         for (; n < nWordCount; n++)
    2843             :         {
    2844        1018 :             pDstData[n] = pSrcData[n];
    2845         397 :         }
    2846             :     }
    2847             :     else
    2848             :     {
    2849           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2850             :                               nDstPixelStride, nWordCount);
    2851             :     }
    2852         403 : }
    2853             : 
    2854             : template <>
    2855     1076640 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2856             :                                  int nSrcPixelStride,
    2857             :                                  float *const CPL_RESTRICT pDstData,
    2858             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2859             : {
    2860     1076640 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2861             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2862             :     {
    2863       86742 :         decltype(nWordCount) n = 0;
    2864       86742 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2865             :             reinterpret_cast<GByte *>(pDstData);
    2866      586119 :         for (; n < nWordCount - 7; n += 8)
    2867             :         {
    2868      499377 :             __m128i xmm = _mm_loadu_si128(
    2869      499377 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2870      499377 :             const auto sign = _mm_srai_epi16(xmm, 15);
    2871      499377 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
    2872      499377 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
    2873      499377 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2874      499377 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2875      499377 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2876             :                           xmm0_f);
    2877             :             _mm_storeu_ps(
    2878      499377 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2879             :         }
    2880      253882 :         for (; n < nWordCount; n++)
    2881             :         {
    2882      167140 :             pDstData[n] = pSrcData[n];
    2883       86742 :         }
    2884             :     }
    2885             :     else
    2886             :     {
    2887      989901 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2888             :                               nDstPixelStride, nWordCount);
    2889             :     }
    2890     1076640 : }
    2891             : 
    2892             : template <>
    2893         449 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2894             :                                  int nSrcPixelStride,
    2895             :                                  double *const CPL_RESTRICT pDstData,
    2896             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2897             : {
    2898         449 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2899             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2900             :     {
    2901         313 :         decltype(nWordCount) n = 0;
    2902         313 :         const __m128i xmm_zero = _mm_setzero_si128();
    2903         313 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2904             :             reinterpret_cast<GByte *>(pDstData);
    2905         829 :         for (; n < nWordCount - 7; n += 8)
    2906             :         {
    2907         516 :             __m128i xmm = _mm_loadu_si128(
    2908         516 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2909         516 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2910         516 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2911             : 
    2912         516 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2913         516 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2914         516 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2915         516 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2916         516 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2917         516 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2918             : 
    2919         516 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2920             :                           xmm0_low_d);
    2921             :             _mm_storeu_pd(
    2922         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2923             :                 xmm0_high_d);
    2924             :             _mm_storeu_pd(
    2925         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2926             :                 xmm1_low_d);
    2927             :             _mm_storeu_pd(
    2928         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2929             :                 xmm1_high_d);
    2930             :         }
    2931        1082 :         for (; n < nWordCount; n++)
    2932             :         {
    2933         769 :             pDstData[n] = pSrcData[n];
    2934         313 :         }
    2935             :     }
    2936             :     else
    2937             :     {
    2938         136 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2939             :                               nDstPixelStride, nWordCount);
    2940             :     }
    2941         449 : }
    2942             : 
    2943             : template <>
    2944     4923280 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2945             :                                  int nSrcPixelStride,
    2946             :                                  double *const CPL_RESTRICT pDstData,
    2947             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2948             : {
    2949     4923280 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2950             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2951             :     {
    2952       34874 :         decltype(nWordCount) n = 0;
    2953       34874 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2954             :             reinterpret_cast<GByte *>(pDstData);
    2955      403828 :         for (; n < nWordCount - 7; n += 8)
    2956             :         {
    2957      368954 :             __m128i xmm = _mm_loadu_si128(
    2958      368954 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2959      368954 :             const auto sign = _mm_srai_epi16(xmm, 15);
    2960      368954 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
    2961      368954 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
    2962             : 
    2963      368954 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2964      368954 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2965      368954 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2966      368954 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2967      368954 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2968      368954 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2969             : 
    2970      368954 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2971             :                           xmm0_low_d);
    2972             :             _mm_storeu_pd(
    2973      368954 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2974             :                 xmm0_high_d);
    2975             :             _mm_storeu_pd(
    2976      368954 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2977             :                 xmm1_low_d);
    2978             :             _mm_storeu_pd(
    2979      368954 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2980             :                 xmm1_high_d);
    2981             :         }
    2982      255934 :         for (; n < nWordCount; n++)
    2983             :         {
    2984      221060 :             pDstData[n] = pSrcData[n];
    2985       34874 :         }
    2986             :     }
    2987             :     else
    2988             :     {
    2989     4888400 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2990             :                               nDstPixelStride, nWordCount);
    2991             :     }
    2992     4923280 : }
    2993             : 
    2994             : // ---- AVX2 helpers for int32 narrowing (runtime dispatch) ----
    2995             : 
    2996             : #if defined(HAVE_AVX2_DISPATCH)
    2997             : #if !defined(_MSC_VER)
    2998             : __attribute__((target("avx2")))
    2999             : #endif
    3000       12723 : static void GDALCopyWordsInt32ToUInt8_AVX2(const int32_t *CPL_RESTRICT pSrc,
    3001             :                                            uint8_t *CPL_RESTRICT pDst,
    3002             :                                            GPtrDiff_t nWordCount)
    3003             : {
    3004       12723 :     const __m256i permuteIdx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
    3005       12723 :     GPtrDiff_t n = 0;
    3006      958119 :     for (; n < nWordCount - 31; n += 32)
    3007             :     {
    3008             :         __m256i v0 =
    3009      945396 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
    3010             :         __m256i v1 =
    3011      945396 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
    3012      945396 :         __m256i v2 = _mm256_loadu_si256(
    3013      945396 :             reinterpret_cast<const __m256i *>(pSrc + n + 16));
    3014      945396 :         __m256i v3 = _mm256_loadu_si256(
    3015      945396 :             reinterpret_cast<const __m256i *>(pSrc + n + 24));
    3016             :         // Clamp to [0, 255]
    3017             :         // Pack int32 -> int16 -> uint8, then fix cross-lane ordering
    3018      945396 :         __m256i ab16 = _mm256_packs_epi32(v0, v1);
    3019      945396 :         __m256i cd16 = _mm256_packs_epi32(v2, v3);
    3020      945396 :         __m256i bytes = _mm256_packus_epi16(ab16, cd16);
    3021      945396 :         bytes = _mm256_permutevar8x32_epi32(bytes, permuteIdx);
    3022      945396 :         _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), bytes);
    3023             :     }
    3024       68589 :     for (; n < nWordCount; n++)
    3025             :     {
    3026       70955 :         pDst[n] = pSrc[n] <= 0     ? 0
    3027       15089 :                   : pSrc[n] >= 255 ? 255
    3028        1075 :                                    : static_cast<uint8_t>(pSrc[n]);
    3029             :     }
    3030       12723 : }
    3031             : 
    3032             : #if !defined(_MSC_VER)
    3033             : __attribute__((target("avx2")))
    3034             : #endif
    3035       10277 : static void GDALCopyWordsInt32ToUInt16_AVX2(const int32_t *CPL_RESTRICT pSrc,
    3036             :                                             uint16_t *CPL_RESTRICT pDst,
    3037             :                                             GPtrDiff_t nWordCount)
    3038             : {
    3039             :     // _mm256_packus_epi32(v0, v1) produces per-lane interleaved result:
    3040             :     //   [v0_lo4, v1_lo4, v0_hi4, v1_hi4] (in uint16 pairs per 32-bit lane)
    3041             :     // Permute to deinterleave: all v0 values first, then all v1 values
    3042       10277 :     const __m256i permuteIdx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
    3043       10277 :     GPtrDiff_t n = 0;
    3044      670572 :     for (; n < nWordCount - 15; n += 16)
    3045             :     {
    3046             :         __m256i v0 =
    3047      660295 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
    3048             :         __m256i v1 =
    3049     1320590 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
    3050             :         // Clamp to [0, 65535]: _mm256_packus_epi32 saturates to uint16
    3051      660295 :         __m256i packed = _mm256_packus_epi32(v0, v1);
    3052             :         // Fix cross-lane interleave from packus
    3053      660295 :         packed = _mm256_permutevar8x32_epi32(packed, permuteIdx);
    3054      660295 :         _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), packed);
    3055             :     }
    3056      163928 :     for (; n < nWordCount; n++)
    3057             :     {
    3058      307282 :         pDst[n] = pSrc[n] <= 0       ? 0
    3059      153631 :                   : pSrc[n] >= 65535 ? 65535
    3060      153599 :                                      : static_cast<uint16_t>(pSrc[n]);
    3061             :     }
    3062       10277 : }
    3063             : #endif  // HAVE_AVX2_DISPATCH
    3064             : 
    3065             : // ---- int32 -> uint8 with clamping to [0, 255] ----
    3066             : template <>
    3067       12837 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    3068             :                                  int nSrcPixelStride,
    3069             :                                  uint8_t *const CPL_RESTRICT pDstData,
    3070             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3071             : {
    3072       12837 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    3073             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    3074             :     {
    3075             : #if defined(HAVE_AVX2_DISPATCH)
    3076       12723 :         if (CPLHaveRuntimeAVX2())
    3077             :         {
    3078       12723 :             GDALCopyWordsInt32ToUInt8_AVX2(pSrcData, pDstData, nWordCount);
    3079       12723 :             return;
    3080             :         }
    3081             : #endif
    3082             : #ifdef HAVE_SSE2
    3083             :         // SSE2 path: 16 pixels per iteration
    3084           0 :         decltype(nWordCount) n = 0;
    3085           0 :         for (; n < nWordCount - 15; n += 16)
    3086             :         {
    3087           0 :             __m128i v0 = _mm_loadu_si128(
    3088           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    3089           0 :             __m128i v1 = _mm_loadu_si128(
    3090           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    3091           0 :             __m128i v2 = _mm_loadu_si128(
    3092           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    3093           0 :             __m128i v3 = _mm_loadu_si128(
    3094           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 12));
    3095             :             // Clamp to [0, 255]: saturating pack int32->int16->uint8
    3096           0 :             __m128i lo16 = _mm_packs_epi32(v0, v1);
    3097           0 :             __m128i hi16 = _mm_packs_epi32(v2, v3);
    3098           0 :             __m128i bytes = _mm_packus_epi16(lo16, hi16);
    3099           0 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), bytes);
    3100             :         }
    3101           0 :         for (; n < nWordCount; n++)
    3102             : #else
    3103             :         for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3104             : #endif
    3105             :         {
    3106           0 :             pDstData[n] = pSrcData[n] <= 0 ? 0
    3107           0 :                           : pSrcData[n] >= 255
    3108             :                               ? 255
    3109           0 :                               : static_cast<uint8_t>(pSrcData[n]);
    3110           0 :         }
    3111             :     }
    3112             :     else
    3113             :     {
    3114         114 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    3115             :                               nDstPixelStride, nWordCount);
    3116             :     }
    3117             : }
    3118             : 
    3119             : // ---- int32 -> uint16 with clamping to [0, 65535] ----
    3120             : template <>
    3121       10322 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    3122             :                                  int nSrcPixelStride,
    3123             :                                  uint16_t *const CPL_RESTRICT pDstData,
    3124             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3125             : {
    3126       10322 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    3127             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    3128             :     {
    3129             : #if defined(HAVE_AVX2_DISPATCH)
    3130       10277 :         if (CPLHaveRuntimeAVX2())
    3131             :         {
    3132       10277 :             GDALCopyWordsInt32ToUInt16_AVX2(pSrcData, pDstData, nWordCount);
    3133       10277 :             return;
    3134             :         }
    3135             : #endif
    3136           0 :         decltype(nWordCount) n = 0;
    3137             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    3138             :         // SSE4.1: _mm_packus_epi32 directly handles uint saturation
    3139             :         for (; n < nWordCount - 7; n += 8)
    3140             :         {
    3141             :             __m128i v0 = _mm_loadu_si128(
    3142             :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    3143             :             __m128i v1 = _mm_loadu_si128(
    3144             :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    3145             :             __m128i packed = _mm_packus_epi32(v0, v1);
    3146             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), packed);
    3147             :         }
    3148             : #else
    3149             :         // SSE2: clamp to [0, 65535], bias to signed range, pack, unbias
    3150           0 :         const __m128i xmm_65535 = _mm_set1_epi32(65535);
    3151           0 :         const __m128i xmm_bias32 = _mm_set1_epi32(32768);
    3152           0 :         const __m128i xmm_bias16 = _mm_set1_epi16(-32768);
    3153           0 :         for (; n < nWordCount - 7; n += 8)
    3154             :         {
    3155           0 :             __m128i v0 = _mm_loadu_si128(
    3156           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    3157           0 :             __m128i v1 = _mm_loadu_si128(
    3158           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    3159             :             // max(v, 0)
    3160           0 :             v0 = _mm_andnot_si128(_mm_srai_epi32(v0, 31), v0);
    3161           0 :             v1 = _mm_andnot_si128(_mm_srai_epi32(v1, 31), v1);
    3162             :             // min(v, 65535)
    3163           0 :             __m128i gt0 = _mm_cmpgt_epi32(v0, xmm_65535);
    3164           0 :             __m128i gt1 = _mm_cmpgt_epi32(v1, xmm_65535);
    3165           0 :             v0 = _mm_or_si128(_mm_andnot_si128(gt0, v0),
    3166             :                               _mm_and_si128(gt0, xmm_65535));
    3167           0 :             v1 = _mm_or_si128(_mm_andnot_si128(gt1, v1),
    3168             :                               _mm_and_si128(gt1, xmm_65535));
    3169             :             // Shift [0, 65535] -> [-32768, 32767] for _mm_packs_epi32
    3170           0 :             v0 = _mm_sub_epi32(v0, xmm_bias32);
    3171           0 :             v1 = _mm_sub_epi32(v1, xmm_bias32);
    3172           0 :             __m128i packed = _mm_packs_epi32(v0, v1);
    3173             :             // Shift back: sub_epi16(x, -32768) == add 32768 (mod 2^16)
    3174           0 :             packed = _mm_sub_epi16(packed, xmm_bias16);
    3175           0 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), packed);
    3176             :         }
    3177             : #endif
    3178           0 :         for (; n < nWordCount; n++)
    3179             :         {
    3180           0 :             pDstData[n] = pSrcData[n] <= 0 ? 0
    3181           0 :                           : pSrcData[n] >= 65535
    3182             :                               ? 65535
    3183           0 :                               : static_cast<uint16_t>(pSrcData[n]);
    3184           0 :         }
    3185             :     }
    3186             :     else
    3187             :     {
    3188          45 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    3189             :                               nDstPixelStride, nWordCount);
    3190             :     }
    3191             : }
    3192             : 
    3193             : #endif  // HAVE_SSE2
    3194             : 
    3195             : template <>
    3196     4426980 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3197             :                                  int nSrcPixelStride,
    3198             :                                  GByte *const CPL_RESTRICT pDstData,
    3199             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3200             : {
    3201     4426980 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3202             :                             nDstPixelStride, nWordCount);
    3203     4426980 : }
    3204             : 
    3205             : template <>
    3206       38387 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3207             :                                  int nSrcPixelStride,
    3208             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    3209             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3210             : {
    3211       38387 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3212             :                             nDstPixelStride, nWordCount);
    3213       38387 : }
    3214             : 
    3215             : template <>
    3216       55671 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3217             :                                  int nSrcPixelStride,
    3218             :                                  double *const CPL_RESTRICT pDstData,
    3219             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3220             : {
    3221       55671 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3222             :                             nDstPixelStride, nWordCount);
    3223       55671 : }
    3224             : 
    3225             : template <>
    3226      122845 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3227             :                                  int nSrcPixelStride,
    3228             :                                  float *const CPL_RESTRICT pDstData,
    3229             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3230             : {
    3231      122845 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3232             :                             nDstPixelStride, nWordCount);
    3233      122845 : }
    3234             : 
    3235             : template <>
    3236         412 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
    3237             :                                  int nSrcPixelStride,
    3238             :                                  float *const CPL_RESTRICT pDstData,
    3239             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3240             : {
    3241         412 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3242             :                             nDstPixelStride, nWordCount);
    3243         412 : }
    3244             : 
    3245             : template <>
    3246         544 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
    3247             :                                  int nSrcPixelStride,
    3248             :                                  double *const CPL_RESTRICT pDstData,
    3249             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3250             : {
    3251         544 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3252             :                             nDstPixelStride, nWordCount);
    3253         544 : }
    3254             : 
    3255             : template <>
    3256      314423 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3257             :                                  int nSrcPixelStride,
    3258             :                                  GByte *const CPL_RESTRICT pDstData,
    3259             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3260             : {
    3261      314423 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3262             :                             nDstPixelStride, nWordCount);
    3263      314423 : }
    3264             : 
    3265             : template <>
    3266          55 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3267             :                                  int nSrcPixelStride,
    3268             :                                  GInt8 *const CPL_RESTRICT pDstData,
    3269             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3270             : {
    3271          55 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3272             :                             nDstPixelStride, nWordCount);
    3273          55 : }
    3274             : 
    3275             : template <>
    3276       15785 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3277             :                                  int nSrcPixelStride,
    3278             :                                  GInt16 *const CPL_RESTRICT pDstData,
    3279             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3280             : {
    3281       15785 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3282             :                             nDstPixelStride, nWordCount);
    3283       15785 : }
    3284             : 
    3285             : template <>
    3286       61713 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3287             :                                  int nSrcPixelStride,
    3288             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    3289             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3290             : {
    3291       61713 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3292             :                             nDstPixelStride, nWordCount);
    3293       61713 : }
    3294             : 
    3295             : template <>
    3296       43985 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3297             :                                  int nSrcPixelStride,
    3298             :                                  GInt32 *const CPL_RESTRICT pDstData,
    3299             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3300             : {
    3301       43985 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3302             :                             nDstPixelStride, nWordCount);
    3303       43985 : }
    3304             : 
    3305             : template <>
    3306          72 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3307             :                                  int nSrcPixelStride,
    3308             :                                  GFloat16 *const CPL_RESTRICT pDstData,
    3309             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3310             : {
    3311          72 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3312             :                             nDstPixelStride, nWordCount);
    3313          72 : }
    3314             : 
    3315             : template <>
    3316          63 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3317             :                                  int nSrcPixelStride,
    3318             :                                  GFloat16 *const CPL_RESTRICT pDstData,
    3319             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3320             : {
    3321          63 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3322             :                             nDstPixelStride, nWordCount);
    3323          63 : }
    3324             : 
    3325             : /************************************************************************/
    3326             : /*                       GDALCopyWordsComplexT()                        */
    3327             : /************************************************************************/
    3328             : /**
    3329             :  * Template function, used to copy data from pSrcData into buffer
    3330             :  * pDstData, with stride nSrcPixelStride in the source data and
    3331             :  * stride nDstPixelStride in the destination data. Deals with the
    3332             :  * complex case, where input is complex and output is complex.
    3333             :  *
    3334             :  * @param pSrcData the source data buffer
    3335             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    3336             :  *                      of interest, in bytes.
    3337             :  * @param pDstData the destination buffer.
    3338             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    3339             :  *                      interest, in bytes.
    3340             :  * @param nWordCount the total number of pixel words to copy
    3341             :  *
    3342             :  */
    3343             : template <class Tin, class Tout>
    3344       98788 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
    3345             :                                   int nSrcPixelStride,
    3346             :                                   Tout *const CPL_RESTRICT pDstData,
    3347             :                                   int nDstPixelStride, GPtrDiff_t nWordCount)
    3348             : {
    3349       98788 :     decltype(nWordCount) nDstOffset = 0;
    3350       98788 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    3351       98788 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    3352             : 
    3353     5631239 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3354             :     {
    3355     5532446 :         const Tin *const pPixelIn =
    3356     5532446 :             reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    3357     5532446 :         Tout *const pPixelOut =
    3358     5532446 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    3359             : 
    3360     5532446 :         GDALCopyWord(pPixelIn[0], pPixelOut[0]);
    3361     5532446 :         GDALCopyWord(pPixelIn[1], pPixelOut[1]);
    3362             : 
    3363     5532446 :         nDstOffset += nDstPixelStride;
    3364             :     }
    3365       98788 : }
    3366             : 
    3367             : /************************************************************************/
    3368             : /*                      GDALCopyWordsComplexOutT()                      */
    3369             : /************************************************************************/
    3370             : /**
    3371             :  * Template function, used to copy data from pSrcData into buffer
    3372             :  * pDstData, with stride nSrcPixelStride in the source data and
    3373             :  * stride nDstPixelStride in the destination data. Deals with the
    3374             :  * case where the value is real coming in, but complex going out.
    3375             :  *
    3376             :  * @param pSrcData the source data buffer
    3377             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    3378             :  *                      of interest, in bytes.
    3379             :  * @param pDstData the destination buffer.
    3380             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    3381             :  *                      interest, in bytes.
    3382             :  * @param nWordCount the total number of pixel words to copy
    3383             :  *
    3384             :  */
    3385             : template <class Tin, class Tout>
    3386        4762 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
    3387             :                                      int nSrcPixelStride,
    3388             :                                      Tout *const CPL_RESTRICT pDstData,
    3389             :                                      int nDstPixelStride, GPtrDiff_t nWordCount)
    3390             : {
    3391        4762 :     decltype(nWordCount) nDstOffset = 0;
    3392             : 
    3393        4762 :     const Tout tOutZero = static_cast<Tout>(0);
    3394             : 
    3395        4762 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    3396        4762 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    3397             : 
    3398     1190408 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3399             :     {
    3400     1185646 :         const Tin tValue =
    3401     1185646 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    3402     1185646 :         Tout *const pPixelOut =
    3403     1185646 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    3404     1185646 :         GDALCopyWord(tValue, *pPixelOut);
    3405             : 
    3406     1185646 :         pPixelOut[1] = tOutZero;
    3407             : 
    3408     1185646 :         nDstOffset += nDstPixelStride;
    3409             :     }
    3410        4762 : }
    3411             : 
    3412             : /************************************************************************/
    3413             : /*                         GDALCopyWordsFromT()                         */
    3414             : /************************************************************************/
    3415             : /**
    3416             :  * Template driver function. Given the input type T, call the appropriate
    3417             :  * GDALCopyWordsT function template for the desired output type. You should
    3418             :  * never call this function directly (call GDALCopyWords instead).
    3419             :  *
    3420             :  * @param pSrcData source data buffer
    3421             :  * @param nSrcPixelStride pixel stride in input buffer, in bytes
    3422             :  * @param bInComplex input is complex
    3423             :  * @param pDstData destination data buffer
    3424             :  * @param eDstType destination data type
    3425             :  * @param nDstPixelStride pixel stride in output buffer, in bytes
    3426             :  * @param nWordCount number of pixel words to be copied
    3427             :  */
    3428             : template <class T>
    3429    61292825 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
    3430             :                                int nSrcPixelStride, bool bInComplex,
    3431             :                                void *CPL_RESTRICT pDstData,
    3432             :                                GDALDataType eDstType, int nDstPixelStride,
    3433             :                                GPtrDiff_t nWordCount)
    3434             : {
    3435    61292825 :     switch (eDstType)
    3436             :     {
    3437     4785549 :         case GDT_UInt8:
    3438     4785549 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3439             :                            static_cast<unsigned char *>(pDstData),
    3440             :                            nDstPixelStride, nWordCount);
    3441     4785549 :             break;
    3442        1891 :         case GDT_Int8:
    3443        1891 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3444             :                            static_cast<signed char *>(pDstData),
    3445             :                            nDstPixelStride, nWordCount);
    3446        1891 :             break;
    3447     1143544 :         case GDT_UInt16:
    3448     1143544 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3449             :                            static_cast<unsigned short *>(pDstData),
    3450             :                            nDstPixelStride, nWordCount);
    3451     1143544 :             break;
    3452     4162728 :         case GDT_Int16:
    3453     4162728 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3454             :                            static_cast<short *>(pDstData), nDstPixelStride,
    3455             :                            nWordCount);
    3456     4162728 :             break;
    3457       23084 :         case GDT_UInt32:
    3458       23084 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3459             :                            static_cast<unsigned int *>(pDstData),
    3460             :                            nDstPixelStride, nWordCount);
    3461       23084 :             break;
    3462    29460149 :         case GDT_Int32:
    3463    29460149 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3464             :                            static_cast<int *>(pDstData), nDstPixelStride,
    3465             :                            nWordCount);
    3466    29460149 :             break;
    3467        1250 :         case GDT_UInt64:
    3468        1250 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3469             :                            static_cast<std::uint64_t *>(pDstData),
    3470             :                            nDstPixelStride, nWordCount);
    3471        1250 :             break;
    3472        5957 :         case GDT_Int64:
    3473        5957 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3474             :                            static_cast<std::int64_t *>(pDstData),
    3475             :                            nDstPixelStride, nWordCount);
    3476        5957 :             break;
    3477         999 :         case GDT_Float16:
    3478         999 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3479             :                            static_cast<GFloat16 *>(pDstData), nDstPixelStride,
    3480             :                            nWordCount);
    3481         999 :             break;
    3482     4216050 :         case GDT_Float32:
    3483     4216050 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3484             :                            static_cast<float *>(pDstData), nDstPixelStride,
    3485             :                            nWordCount);
    3486     4216050 :             break;
    3487    17387964 :         case GDT_Float64:
    3488    17387964 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3489             :                            static_cast<double *>(pDstData), nDstPixelStride,
    3490             :                            nWordCount);
    3491    17387964 :             break;
    3492       94424 :         case GDT_CInt16:
    3493       94424 :             if (bInComplex)
    3494             :             {
    3495       93170 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3496             :                                       static_cast<short *>(pDstData),
    3497             :                                       nDstPixelStride, nWordCount);
    3498             :             }
    3499             :             else  // input is not complex, so we need to promote to a complex
    3500             :                   // buffer
    3501             :             {
    3502        1254 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3503             :                                          static_cast<short *>(pDstData),
    3504             :                                          nDstPixelStride, nWordCount);
    3505             :             }
    3506       94424 :             break;
    3507        1349 :         case GDT_CInt32:
    3508        1349 :             if (bInComplex)
    3509             :             {
    3510         717 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3511             :                                       static_cast<int *>(pDstData),
    3512             :                                       nDstPixelStride, nWordCount);
    3513             :             }
    3514             :             else  // input is not complex, so we need to promote to a complex
    3515             :                   // buffer
    3516             :             {
    3517         632 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3518             :                                          static_cast<int *>(pDstData),
    3519             :                                          nDstPixelStride, nWordCount);
    3520             :             }
    3521        1349 :             break;
    3522         313 :         case GDT_CFloat16:
    3523         313 :             if (bInComplex)
    3524             :             {
    3525          48 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3526             :                                       static_cast<GFloat16 *>(pDstData),
    3527             :                                       nDstPixelStride, nWordCount);
    3528             :             }
    3529             :             else  // input is not complex, so we need to promote to a complex
    3530             :                   // buffer
    3531             :             {
    3532         265 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3533             :                                          static_cast<GFloat16 *>(pDstData),
    3534             :                                          nDstPixelStride, nWordCount);
    3535             :             }
    3536         313 :             break;
    3537        3924 :         case GDT_CFloat32:
    3538        3924 :             if (bInComplex)
    3539             :             {
    3540        3115 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3541             :                                       static_cast<float *>(pDstData),
    3542             :                                       nDstPixelStride, nWordCount);
    3543             :             }
    3544             :             else  // input is not complex, so we need to promote to a complex
    3545             :                   // buffer
    3546             :             {
    3547         809 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3548             :                                          static_cast<float *>(pDstData),
    3549             :                                          nDstPixelStride, nWordCount);
    3550             :             }
    3551        3924 :             break;
    3552        3540 :         case GDT_CFloat64:
    3553        3540 :             if (bInComplex)
    3554             :             {
    3555        1738 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3556             :                                       static_cast<double *>(pDstData),
    3557             :                                       nDstPixelStride, nWordCount);
    3558             :             }
    3559             :             else  // input is not complex, so we need to promote to a complex
    3560             :                   // buffer
    3561             :             {
    3562        1802 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3563             :                                          static_cast<double *>(pDstData),
    3564             :                                          nDstPixelStride, nWordCount);
    3565             :             }
    3566        3540 :             break;
    3567           0 :         case GDT_Unknown:
    3568             :         case GDT_TypeCount:
    3569           0 :             CPLAssert(false);
    3570             :     }
    3571    61292825 : }
    3572             : 
    3573             : }  // end anonymous namespace
    3574             : 
    3575             : /************************************************************************/
    3576             : /*                         GDALReplicateWord()                          */
    3577             : /************************************************************************/
    3578             : 
    3579             : template <class T>
                       // Fills the nWordCount destination words that follow pDstData with a
                       // copy of the word already stored at pDstData. That first word is only
                       // read here; it is never rewritten.
                       //
                       //   T                element type of the word to replicate.
                       //   pDstData         address of the already-initialized first word.
                       //   nDstPixelStride  distance, in bytes, between consecutive
                       //                    destination words.
                       //   nWordCount       number of words to write after the first one.
    3580      600405 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
    3581             :                                GPtrDiff_t nWordCount)
    3582             : {
    3583      600405 :     const T valSet = *static_cast<const T *>(pDstData);
    3584      600405 :     if (nDstPixelStride == static_cast<int>(sizeof(T)))
    3585             :     {
                           // Packed destination: T-typed stores starting at the second word,
                           // four per iteration, then one at a time for the remainder.
    3586      570592 :         T *pDstPtr = static_cast<T *>(pDstData) + 1;
    3587    31990099 :         while (nWordCount >= 4)
    3588             :         {
    3589    31419540 :             nWordCount -= 4;
    3590    31419540 :             pDstPtr[0] = valSet;
    3591    31419540 :             pDstPtr[1] = valSet;
    3592    31419540 :             pDstPtr[2] = valSet;
    3593    31419540 :             pDstPtr[3] = valSet;
    3594    31419540 :             pDstPtr += 4;
    3595             :         }
    3596     1476627 :         while (nWordCount > 0)
    3597             :         {
    3598      906035 :             --nWordCount;
    3599      906035 :             *pDstPtr = valSet;
    3600      906035 :             pDstPtr++;
    3601             :         }
    3602             :     }
    3603             :     else
    3604             :     {
                           // Strided destination: advance a byte pointer by nDstPixelStride
                           // and store through a T-typed cast.
                           // NOTE(review): presumably callers pick strides that keep every
                           // word suitably aligned for T - confirm.
    3605       29813 :         GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
    3606     1040984 :         while (nWordCount > 0)
    3607             :         {
    3608     1011171 :             --nWordCount;
    3609     1011171 :             *reinterpret_cast<T *>(pabyDstPtr) = valSet;
    3610     1011171 :             pabyDstPtr += nDstPixelStride;
    3611             :         }
    3612             :     }
    3613      600405 : }
    3614             : 
    3615     1068100 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
    3616             :                               GDALDataType eSrcType,
    3617             :                               void *CPL_RESTRICT pDstData,
    3618             :                               GDALDataType eDstType, int nDstPixelStride,
    3619             :                               GPtrDiff_t nWordCount)
    3620             : {
                           // Writes the single source value at pSrcData, converted from
                           // eSrcType to eDstType, into nWordCount destination words spaced
                           // nDstPixelStride bytes apart starting at pDstData.
                           //
                           //   pSrcData         address of the one source value.
                           //   eSrcType         data type of the source value.
                           //   pDstData         address of the first destination word.
                           //   eDstType         data type of the destination words.
                           //   nDstPixelStride  distance, in bytes, between destination words.
                           //   nWordCount       total number of destination words, including
                           //                    the first one.
                           //
                           // NOTE(review): the first word is written unconditionally, so
                           // nWordCount is presumably always >= 1 - confirm against callers.
    3621             :     /* -----------------------------------------------------------------------
    3622             :      */
    3623             :     /* Special case when the source data is always the same value */
    3624             :     /* (for VRTSourcedRasterBand::IRasterIO and
    3625             :      * VRTDerivedRasterBand::IRasterIO*/
    3626             :     /*  for example) */
    3627             :     /* -----------------------------------------------------------------------
    3628             :      */
    3629             :     // Let the general translation case do the necessary conversions
    3630             :     // on the first destination element.
    3631     1068100 :     GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
    3632             : 
    3633             :     // Now copy the first element to the nWordCount - 1 following destination
    3634             :     // elements.
    3635     1068100 :     nWordCount--;
                           // pabyDstWord addresses the second destination word; from here on
                           // nWordCount is the number of words still to be written.
    3636     1068100 :     GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
    3637             : 
    3638     1068100 :     switch (eDstType)
    3639             :     {
    3640      467605 :         case GDT_UInt8:
    3641             :         case GDT_Int8:
    3642             :         {
    3643      467605 :             if (nDstPixelStride == 1)
    3644             :             {
                                   // Packed bytes: one memset covers all remaining words. The
                                   // guard keeps a non-positive count from being passed as the
                                   // (unsigned) size argument.
    3645      369687 :                 if (nWordCount > 0)
    3646      369687 :                     memset(pabyDstWord,
    3647      369687 :                            *reinterpret_cast<const GByte *>(pDstData),
    3648             :                            nWordCount);
    3649             :             }
    3650             :             else
    3651             :             {
                                   // Strided bytes: store the value one word at a time.
    3652       97918 :                 GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
    3653    67697100 :                 while (nWordCount > 0)
    3654             :                 {
    3655    67599200 :                     --nWordCount;
    3656    67599200 :                     *pabyDstWord = valSet;
    3657    67599200 :                     pabyDstWord += nDstPixelStride;
    3658             :                 }
    3659             :             }
    3660      467605 :             break;
    3661             :         }
    3662             : 
                       // Non-complex multi-byte types: delegate to GDALReplicateWordT
                       // instantiated with the matching C type.
    3663             : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type)                               \
    3664             :     case enum_type:                                                            \
    3665             :     {                                                                          \
    3666             :         GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount);     \
    3667             :         break;                                                                 \
    3668             :     }
    3669             : 
    3670       34513 :             CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
    3671      202455 :             CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
    3672          74 :             CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
    3673      301585 :             CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
    3674          41 :             CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
    3675        1072 :             CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
    3676           2 :             CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
    3677       52858 :             CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
    3678        7805 :             CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
    3679             : 
                       // Complex types: the first destination word is read as two consecutive
                       // c_type components ([0] and [1]) and that pair is stored at every
                       // nDstPixelStride step.
    3680             : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type)                              \
    3681             :     case enum_type:                                                            \
    3682             :     {                                                                          \
    3683             :         c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0];        \
    3684             :         c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1];        \
    3685             :         while (nWordCount > 0)                                                 \
    3686             :         {                                                                      \
    3687             :             --nWordCount;                                                      \
    3688             :             reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1;              \
    3689             :             reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2;              \
    3690             :             pabyDstWord += nDstPixelStride;                                    \
    3691             :         }                                                                      \
    3692             :         break;                                                                 \
    3693             :     }
    3694             : 
    3695         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
    3696         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
    3697           6 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
    3698         790 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
    3699         790 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
    3700             : 
                               // Remaining enumerators are not handled as destination types:
                               // reaching this point trips the assertion.
    3701           0 :         case GDT_Unknown:
    3702             :         case GDT_TypeCount:
    3703           0 :             CPLAssert(false);
    3704             :     }
    3705     1068100 : }
    3706             : 
    3707             : /************************************************************************/
    3708             : /*                          GDALUnrolledCopy()                          */
    3709             : /************************************************************************/
    3710             : 
    3711             : template <class T, int srcStride, int dstStride>
                       // Copies nIters elements of type T from pSrc to pDest. Consecutive
                       // source elements are srcStride elements apart and consecutive
                       // destination elements are dstStride elements apart; both strides are
                       // compile-time constants counted in units of T, not bytes.
                       //
                       // Two build flavours are selected by the preprocessor:
                       //  - __GNUC__ and __AVX2__ both defined: the function carries the
                       //    optimize("tree-vectorize") attribute, the manual unrolling is
                       //    compiled out, and the final loop (annotated with
                       //    "#pragma GCC unroll 4") performs every iteration.
                       //  - otherwise: blocks of 16 elements are copied by a hand-unrolled
                       //    loop and the final loop only handles the nIters % 16 remainder.
    3712             : #if defined(__GNUC__) && defined(__AVX2__)
    3713             : __attribute__((optimize("tree-vectorize")))
    3714             : #endif
    3715     3000825 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
    3716             :                                            const T *CPL_RESTRICT pSrc,
    3717             :                                            GPtrDiff_t nIters)
    3718             : {
    3719             : #if !(defined(__GNUC__) && defined(__AVX2__))
    3720     3000825 :     if (nIters >= 16)
    3721             :     {
                           // Each pass copies 16 elements, then moves both pointers past them.
    3722   132814787 :         for (GPtrDiff_t i = nIters / 16; i != 0; i--)
    3723             :         {
    3724   129934645 :             pDest[0 * dstStride] = pSrc[0 * srcStride];
    3725   129934645 :             pDest[1 * dstStride] = pSrc[1 * srcStride];
    3726   129934645 :             pDest[2 * dstStride] = pSrc[2 * srcStride];
    3727   129934645 :             pDest[3 * dstStride] = pSrc[3 * srcStride];
    3728   129934645 :             pDest[4 * dstStride] = pSrc[4 * srcStride];
    3729   129934645 :             pDest[5 * dstStride] = pSrc[5 * srcStride];
    3730   129934645 :             pDest[6 * dstStride] = pSrc[6 * srcStride];
    3731   129934645 :             pDest[7 * dstStride] = pSrc[7 * srcStride];
    3732   129934645 :             pDest[8 * dstStride] = pSrc[8 * srcStride];
    3733   129934645 :             pDest[9 * dstStride] = pSrc[9 * srcStride];
    3734   129934645 :             pDest[10 * dstStride] = pSrc[10 * srcStride];
    3735   129934645 :             pDest[11 * dstStride] = pSrc[11 * srcStride];
    3736   129934645 :             pDest[12 * dstStride] = pSrc[12 * srcStride];
    3737   129934645 :             pDest[13 * dstStride] = pSrc[13 * srcStride];
    3738   129934645 :             pDest[14 * dstStride] = pSrc[14 * srcStride];
    3739   129934645 :             pDest[15 * dstStride] = pSrc[15 * srcStride];
    3740   129934645 :             pDest += 16 * dstStride;
    3741   129934645 :             pSrc += 16 * srcStride;
    3742             :         }
    3743     2880267 :         nIters = nIters % 16;
    3744             :     }
    3745             : #else
    3746             : #pragma GCC unroll 4
    3747             : #endif
    3748     5162269 :     for (GPtrDiff_t i = 0; i < nIters; i++)
    3749             :     {
                           // The destination is indexed from its current base while the
                           // source pointer itself is advanced by one stride per element.
    3750     2161443 :         pDest[i * dstStride] = *pSrc;
    3751     2161443 :         pSrc += srcStride;
    3752             :     }
    3753     3000825 : }
    3754             : 
    3755             : template <class T, int srcStride, int dstStride>
                       // Primary template of the strided copy entry point: forwards unchanged
                       // to GDALUnrolledCopyGeneric<T, srcStride, dstStride>. Explicit GByte
                       // specializations that follow in this file replace it for some stride
                       // pairs when the corresponding SIMD support is compiled in.
    3756     3000825 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
    3757             :                                     const T *CPL_RESTRICT pSrc,
    3758             :                                     GPtrDiff_t nIters)
    3759             : {
    3760     3000825 :     GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
    3761     3000825 : }
    3762             : 
    3763             : #if defined(__AVX2__) && defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                \
    3764             :     (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS))
    3765             : 
    3766             : template <>
                       // Specialization for bytes with a source stride of 3 and a packed
                       // destination, compiled only when the enclosing #if holds (__AVX2__
                       // and HAVE_SSSE3_AT_COMPILE_TIME both defined). Runs longer than 16
                       // elements are handed to GDALUnrolledCopy_GByte_3_1_SSSE3, which is
                       // defined outside this section; shorter runs use a scalar loop.
    3767             : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3768             :                                    const GByte *CPL_RESTRICT pSrc,
    3769             :                                    GPtrDiff_t nIters)
    3770             : {
    3771             :     if (nIters > 16)
    3772             :     {
    3773             :         // The SSSE3 variant is slightly faster than what the gcc autovectorizer
    3774             :         // generates
    3775             :         GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    3776             :     }
    3777             :     else
    3778             :     {
                           // Short run: take every third source byte.
    3779             :         for (GPtrDiff_t i = 0; i < nIters; i++)
    3780             :         {
    3781             :             pDest[i] = *pSrc;
    3782             :             pSrc += 3;
    3783             :         }
    3784             :     }
    3785             : }
    3786             : 
    3787             : #elif defined(HAVE_SSE2) && !(defined(__GNUC__) && defined(__AVX2__))
    3788             : 
    3789             : template <>
                       // SSE2 specialization for bytes with a source stride of 2 and a packed
                       // destination: keeps every other source byte. For nIters > 16 the
                       // vector loop turns 32 source bytes into 16 destination bytes per
                       // iteration; whatever is left (or the whole run when nIters <= 16) is
                       // copied by the scalar loop at the end.
    3790      354194 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
    3791             :                                    const GByte *CPL_RESTRICT pSrc,
    3792             :                                    GPtrDiff_t nIters)
    3793             : {
                           // i counts destination bytes already written; it is shared by the
                           // vector loop and the scalar tail.
    3794      354194 :     decltype(nIters) i = 0;
    3795      354194 :     if (nIters > 16)
    3796             :     {
                           // 0x00ff in every 16-bit lane: selects the first byte of each
                           // two-byte source group.
    3797      194667 :         const __m128i xmm_mask = _mm_set1_epi16(0xff);
    3798             :         // If we were sure that there would always be 1 trailing byte, we could
    3799             :         // check against nIters - 15
    3800     2988110 :         for (; i < nIters - 16; i += 16)
    3801             :         {
    3802             :             __m128i xmm0 =
    3803     2793440 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3804             :             __m128i xmm1 =
    3805     5586890 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3806             :             // Set higher 8bit of each int16 packed word to 0
    3807     2793440 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3808     2793440 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3809             :             // Pack int16 to uint8 and merge back both vector
    3810     2793440 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    3811             : 
    3812             :             // Store result
    3813     2793440 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3814             : 
    3815     2793440 :             pSrc += 2 * 16;
    3816             :         }
    3817             :     }
                           // Scalar tail; pSrc already points at the source byte for element i.
    3818     4633800 :     for (; i < nIters; i++)
    3819             :     {
    3820     4279610 :         pDest[i] = *pSrc;
    3821     4279610 :         pSrc += 2;
    3822             :     }
    3823      354194 : }
    3824             : 
    3825           1 : static void GDALUnrolledCopy_GByte_3_1_SSE2(GByte *CPL_RESTRICT pDest,
    3826             :                                             const GByte *CPL_RESTRICT pSrc,
    3827             :                                             GPtrDiff_t nIters)
    3828             : {
                           // SSE2-only extraction of one byte out of every three: writes
                           // nIters bytes to pDest, taking source bytes 0, 3, 6, ... of pSrc.
                           //
                           // Each vector iteration loads 48 source bytes into three registers
                           // and builds 16 destination bytes one at a time: a register is
                           // byte-shifted so the wanted source byte lands on its destination
                           // position, ANDed with a mask that has only that byte set, and ORed
                           // into the accumulator. Destination bytes 0-5 come from xmm0, 6-10
                           // from xmm1 and 11-15 from xmm2. Remaining elements are copied by
                           // the scalar loop at the end.
    3829           1 :     decltype(nIters) i = 0;
                           // Mask with only byte 0 set; the per-position masks are derived from
                           // it with byte shifts.
    3830           1 :     const __m128i xmm_mask_ori = _mm_set_epi32(0, 0, 0, 255);
    3831             :     // If we were sure that there would always be 2 trailing bytes, we could
    3832             :     // check against nIters - 15
    3833           2 :     for (; i < nIters - 16; i += 16)
    3834             :     {
    3835             :         __m128i xmm0 =
    3836           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3837             :         __m128i xmm1 =
    3838           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3839             :         __m128i xmm2 =
    3840           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    3841             : 
                           // Initial masks select destination byte 0 (from xmm0), byte 6
                           // (from xmm1) and byte 11 (from xmm2).
    3842           1 :         auto xmm_mask0 = xmm_mask_ori;
    3843           1 :         auto xmm_mask1 = _mm_slli_si128(xmm_mask_ori, 6);
    3844           1 :         auto xmm_mask2 = _mm_slli_si128(xmm_mask_ori, 11);
    3845             : 
                           // Destination bytes 0 and 6.
    3846           1 :         auto xmm = _mm_and_si128(xmm0, xmm_mask0);
    3847           1 :         auto xmm_res1 = _mm_and_si128(_mm_slli_si128(xmm1, 4), xmm_mask1);
    3848             : 
                           // Destination bytes 1 and 7. From here on each step moves the masks
                           // up by one byte and shifts xmm0 down by two more bytes.
    3849           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3850           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3851           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3852           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3853           2 :         xmm_res1 = _mm_or_si128(
    3854             :             xmm_res1, _mm_and_si128(_mm_slli_si128(xmm1, 2), xmm_mask1));
    3855             : 
                           // Destination bytes 2 and 8 (byte 8 of xmm1 is already in place).
    3856           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3857           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3858           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3859           2 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3860           1 :         xmm_res1 = _mm_or_si128(xmm_res1, _mm_and_si128(xmm1, xmm_mask1));
    3861             : 
                           // Destination bytes 3 and 9.
    3862           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3863           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3864           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3865           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3866           2 :         xmm_res1 = _mm_or_si128(
    3867             :             xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 2), xmm_mask1));
    3868             : 
                           // Destination bytes 4 and 10, then merge the xmm1 contribution.
    3869           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3870           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3871           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3872           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3873           3 :         xmm_res1 = _mm_or_si128(
    3874             :             xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 4), xmm_mask1));
    3875           1 :         xmm = _mm_or_si128(xmm, xmm_res1);
    3876             : 
                           // Destination byte 5: the last one taken from xmm0.
    3877           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3878           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3879           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3880             : 
                           // Destination bytes 11 to 15, all taken from xmm2 with a left shift
                           // that shrinks by two bytes per step.
    3881           2 :         xmm = _mm_or_si128(xmm,
    3882             :                            _mm_and_si128(_mm_slli_si128(xmm2, 10), xmm_mask2));
    3883             : 
    3884           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3885           2 :         xmm = _mm_or_si128(xmm,
    3886             :                            _mm_and_si128(_mm_slli_si128(xmm2, 8), xmm_mask2));
    3887             : 
    3888           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3889           2 :         xmm = _mm_or_si128(xmm,
    3890             :                            _mm_and_si128(_mm_slli_si128(xmm2, 6), xmm_mask2));
    3891             : 
    3892           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3893           2 :         xmm = _mm_or_si128(xmm,
    3894             :                            _mm_and_si128(_mm_slli_si128(xmm2, 4), xmm_mask2));
    3895             : 
    3896           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3897           2 :         xmm = _mm_or_si128(xmm,
    3898             :                            _mm_and_si128(_mm_slli_si128(xmm2, 2), xmm_mask2));
    3899             : 
    3900           1 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm);
    3901             : 
    3902           1 :         pSrc += 3 * 16;
    3903             :     }
                           // Scalar tail; pSrc already points at the source byte for element i.
    3904           2 :     for (; i < nIters; i++)
    3905             :     {
    3906           1 :         pDest[i] = *pSrc;
    3907           1 :         pSrc += 3;
    3908             :     }
    3909           1 : }
    3910             : 
    3911             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    3912             : 
    3913             : template <>
                       // Specialization for bytes with a source stride of 3 and a packed
                       // destination, used when HAVE_SSSE3_AT_COMPILE_TIME is defined in the
                       // SSE2 build flavour. Runs longer than 16 elements go to the SSSE3
                       // implementation (defined outside this section) if
                       // CPLHaveRuntimeSSSE3() returns true, otherwise to the SSE2 one;
                       // shorter runs use a scalar loop.
    3914      192265 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3915             :                                    const GByte *CPL_RESTRICT pSrc,
    3916             :                                    GPtrDiff_t nIters)
    3917             : {
    3918      192265 :     if (nIters > 16)
    3919             :     {
    3920      186142 :         if (CPLHaveRuntimeSSSE3())
    3921             :         {
    3922      186141 :             GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    3923             :         }
    3924             :         else
    3925             :         {
    3926           1 :             GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
    3927             :         }
    3928             :     }
    3929             :     else
    3930             :     {
                           // Short run: take every third source byte.
    3931       20384 :         for (GPtrDiff_t i = 0; i < nIters; i++)
    3932             :         {
    3933       14261 :             pDest[i] = *pSrc;
    3934       14261 :             pSrc += 3;
    3935             :         }
    3936             :     }
    3937      192265 : }
    3938             : 
    3939             : #else
    3940             : 
    3941             : template <>
                       // Specialization for bytes with a source stride of 3 and a packed
                       // destination, used when HAVE_SSSE3_AT_COMPILE_TIME is not defined:
                       // every call goes to the SSE2 implementation, whatever the run length.
                       // NOTE(review): this relies on the callee's "i < nIters - 16" loop
                       // bound being false for short runs, which assumes GPtrDiff_t is a
                       // signed type - confirm.
    3942             : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3943             :                                    const GByte *CPL_RESTRICT pSrc,
    3944             :                                    GPtrDiff_t nIters)
    3945             : {
    3946             :     GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
    3947             : }
    3948             : #endif
    3949             : 
    3950             : template <>
                       // SSE2 specialization for bytes with a source stride of 4 and a packed
                       // destination: keeps the first byte of every four-byte source group.
                       // For nIters > 16 the vector loop turns 64 source bytes into 16
                       // destination bytes per iteration; whatever is left (or the whole run
                       // when nIters <= 16) is copied by the scalar loop at the end.
    3951      332657 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
    3952             :                                    const GByte *CPL_RESTRICT pSrc,
    3953             :                                    GPtrDiff_t nIters)
    3954             : {
                           // i counts destination bytes already written; it is shared by the
                           // vector loop and the scalar tail.
    3955      332657 :     decltype(nIters) i = 0;
    3956      332657 :     if (nIters > 16)
    3957             :     {
                           // 0x000000ff in every 32-bit lane: selects the first byte of each
                           // four-byte source group.
    3958      327364 :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    3959             :         // If we were sure that there would always be 3 trailing bytes, we could
    3960             :         // check against nIters - 15
    3961    28043500 :         for (; i < nIters - 16; i += 16)
    3962             :         {
    3963             :             __m128i xmm0 =
    3964    27716100 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3965             :             __m128i xmm1 =
    3966    27716100 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3967             :             __m128i xmm2 =
    3968    27716100 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    3969             :             __m128i xmm3 =
    3970    55432200 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
    3971             :             // Set higher 24bit of each int32 packed word to 0
    3972    27716100 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3973    27716100 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3974    27716100 :             xmm2 = _mm_and_si128(xmm2, xmm_mask);
    3975    27716100 :             xmm3 = _mm_and_si128(xmm3, xmm_mask);
                               // After masking every lane holds at most 0xff, so the saturating
                               // packs below leave the values unchanged.
    3976             :             // Pack int32 to int16
    3977    27716100 :             xmm0 = _mm_packs_epi32(xmm0, xmm1);
    3978    27716100 :             xmm2 = _mm_packs_epi32(xmm2, xmm3);
    3979             :             // Pack int16 to uint8
    3980    27716100 :             xmm0 = _mm_packus_epi16(xmm0, xmm2);
    3981             : 
    3982             :             // Store result
    3983    27716100 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3984             : 
    3985    27716100 :             pSrc += 4 * 16;
    3986             :         }
    3987             :     }
                           // Scalar tail; pSrc already points at the source byte for element i.
    3988     5048740 :     for (; i < nIters; i++)
    3989             :     {
    3990     4716080 :         pDest[i] = *pSrc;
    3991     4716080 :         pSrc += 4;
    3992             :     }
    3993      332657 : }
    3994             : #endif  // HAVE_SSE2
    3995             : 
    3996             : /************************************************************************/
    3997             : /*                            GDALFastCopy()                            */
    3998             : /************************************************************************/
    3999             : 
    4000             : template <class T>  // Copies nIters elements of type T between two strided buffers.
    4001    40101500 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
    4002             :                                 const T *CPL_RESTRICT pSrc, int nSrcStride,
    4003             :                                 GPtrDiff_t nIters)  // strides are expressed in bytes
    4004             : {
    4005    40101500 :     constexpr int sizeofT = static_cast<int>(sizeof(T));
    4006    40101500 :     if (nIters == 1)  // single element: the strides are never used
    4007             :     {
    4008    22540480 :         *pDest = *pSrc;
    4009             :     }
    4010    17560932 :     else if (nDestStride == sizeofT)  // destination is packed
    4011             :     {
    4012    14486989 :         if (nSrcStride == sizeofT)  // source packed too: one contiguous copy
    4013             :         {
    4014    13398074 :             memcpy(pDest, pSrc, nIters * sizeof(T));
    4015             :         }
    4016     1088849 :         else if (nSrcStride == 2 * sizeofT)  // source steps of 2, 3 or 4 elements: unrolled helpers
    4017             :         {
    4018      357409 :             GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
    4019             :         }
    4020      731440 :         else if (nSrcStride == 3 * sizeofT)
    4021             :         {
    4022      289245 :             GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
    4023             :         }
    4024      442195 :         else if (nSrcStride == 4 * sizeofT)
    4025             :         {
    4026      336639 :             GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
    4027             :         }
    4028             :         else  // any other source stride: plain element loop
    4029             :         {
    4030    17229290 :             while (nIters-- > 0)
    4031             :             {
    4032    17123750 :                 *pDest = *pSrc;
    4033    17123750 :                 pSrc += nSrcStride / sizeofT;  // NOTE(review): division truncates; assumes stride is a multiple of sizeof(T) - confirm callers
    4034    17123750 :                 pDest++;
    4035             :             }
    4036             :         }
    4037             :     }
    4038     3073963 :     else if (nSrcStride == sizeofT)  // source is packed, destination is strided
    4039             :     {
    4040     3060967 :         if (nDestStride == 2 * sizeofT)  // destination steps of 2, 3 or 4 elements: unrolled helpers
    4041             :         {
    4042      151252 :             GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
    4043             :         }
    4044     2909715 :         else if (nDestStride == 3 * sizeofT)
    4045             :         {
    4046     2131771 :             GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
    4047             :         }
    4048      777937 :         else if (nDestStride == 4 * sizeofT)
    4049             :         {
    4050      613625 :             GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
    4051             :         }
    4052             :         else  // any other destination stride: plain element loop
    4053             :         {
    4054    17169660 :             while (nIters-- > 0)
    4055             :             {
    4056    17005410 :                 *pDest = *pSrc;
    4057    17005410 :                 pSrc++;
    4058    17005410 :                 pDest += nDestStride / sizeofT;  // byte stride converted to an element step
    4059             :             }
    4060             :         }
    4061             :     }
    4062             :     else  // neither side is packed: generic strided loop
    4063             :     {
    4064     1220108 :         while (nIters-- > 0)
    4065             :         {
    4066     1207102 :             *pDest = *pSrc;
    4067     1207102 :             pSrc += nSrcStride / sizeofT;
    4068     1207102 :             pDest += nDestStride / sizeofT;
    4069             :         }
    4070             :     }
    4071    40101500 : }
    4072             : 
    4073             : /************************************************************************/
    4074             : /*                          GDALFastCopyByte()                          */
    4075             : /************************************************************************/
    4076             : 
    4077      326320 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
    4078             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
    4079             :                              int nDstPixelStride, GPtrDiff_t nWordCount)
    4080             : {
    4081      326320 :     GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,  // byte wrapper: GDALFastCopy takes (dest, src), the reverse of this signature
    4082             :                  nWordCount);
    4083      326320 : }
    4084             : 
    4085             : /************************************************************************/
    4086             : /*                           GDALCopyWords()                            */
    4087             : /************************************************************************/
    4088             : 
    4089             : /**
    4090             :  * Copy pixel words from buffer to buffer.
    4091             :  * Variant taking an int word count; all arguments are forwarded unchanged.
    4092             :  * @see GDALCopyWords64()
    4093             :  */
    4094    80491000 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
    4095             :                                GDALDataType eSrcType, int nSrcPixelStride,
    4096             :                                void *CPL_RESTRICT pDstData,
    4097             :                                GDALDataType eDstType, int nDstPixelStride,
    4098             :                                int nWordCount)
    4099             : {
    4100    80491000 :     GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
    4101             :                     nDstPixelStride, nWordCount);  // int count is converted to GPtrDiff_t
    4102    80491000 : }
    4103             : 
    4104             : /************************************************************************/
    4105             : /*                          GDALCopyWords64()                           */
    4106             : /************************************************************************/
    4107             : 
    4108             : /**
    4109             :  * Copy pixel words from buffer to buffer.
    4110             :  *
    4111             :  * This function is used to copy pixel word values from one memory buffer
    4112             :  * to another, with support for conversion between data types, and differing
    4113             :  * step factors. The data type conversion is done using the following
    4114             :  * rules:
    4115             :  * <ul>
    4116             :  * <li>Values assigned to a lower range integer type are clipped. For
    4117             :  * instance assigning GDT_Int16 values to a GDT_UInt8 buffer will cause values
    4118             :  * less than 0 to be set to 0, and values larger than 255 to be set to 255.
    4119             :  * </li>
    4120             :  * <li>
    4121             :  * Assignment from floating point to integer rounds to closest integer.
    4122             :  * +Infinity is mapped to the largest integer. -Infinity is mapped to the
    4123             :  * smallest integer. NaN is mapped to 0.
    4124             :  * </li>
    4125             :  * <li>
    4126             :  * Assignment from non-complex to complex will result in the imaginary part
    4127             :  * being set to zero on output.
    4128             :  * </li>
    4129             :  * <li> Assignment from complex to
    4130             :  * non-complex will result in the complex portion being lost and the real
    4131             :  * component being preserved (<i>not magnitude!</i>).
    4132             :  * </li>
    4133             :  * </ul>
    4134             :  *
    4135             :  * No assumptions are made about the source or destination words occurring
    4136             :  * on word boundaries.  It is assumed that all values are in native machine
    4137             :  * byte order.
    4138             :  *
    4139             :  * @param pSrcData Pointer to source data to be converted.
    4140             :  * @param eSrcType the source data type (see GDALDataType enum)
    4141             :  * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
    4142             :  * in bytes
    4143             :  * @param pDstData Pointer to buffer where destination data should go
    4144             :  * @param eDstType the destination data type (see GDALDataType enum)
    4145             :  * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
    4146             :  * words), in bytes
    4147             :  * @param nWordCount number of words to be copied
    4148             :  *
    4149             :  * @note
    4150             :  * When adding a new data type to GDAL, you must do the following to
    4151             :  * support it properly within the GDALCopyWords function:
    4152             :  * 1. Add the data type to the switch on eSrcType in GDALCopyWords64.
    4153             :  *    This should invoke the appropriate GDALCopyWordsFromT wrapper.
    4154             :  * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
    4155             :  *    This should call the appropriate GDALCopyWordsT template.
    4156             :  * 3. If appropriate, overload the appropriate CopyWord template in the
    4157             :  *    above namespace. This will ensure that any conversion issues are
    4158             :  *    handled (cases like the float -> int32 case, where the min/max
    4159             :  *    values are subject to roundoff error).
    4160             :  */
    4161             : 
    4162   116774000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
    4163             :                                  GDALDataType eSrcType, int nSrcPixelStride,
    4164             :                                  void *CPL_RESTRICT pDstData,
    4165             :                                  GDALDataType eDstType, int nDstPixelStride,
    4166             :                                  GPtrDiff_t nWordCount)
    4167             : 
    4168             : {
    4169             :     // On platforms where alignment matters, be careful
    4170   116774000 :     const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
    4171   116774000 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
    4172   116774000 :     if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))  // a zero size is reported as an unsupported type
    4173             :     {
    4174           2 :         CPLError(CE_Failure, CPLE_NotSupported,
    4175             :                  "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
    4176             :                  "argument");
    4177           2 :         return;
    4178             :     }
    4179   116774000 :     if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&  // check skipped when both type and stride match
    4180    66323000 :         ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
    4181    66323000 :          (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
    4182    66322600 :          (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
    4183    66322500 :          (nDstPixelStride % nDstDataTypeSize) != 0))
    4184             :     {  // a pointer or stride is not a multiple of its type size: copy word by word through memcpy
    4185         905 :         if (eSrcType == eDstType)  // same type: raw byte copy of each word, no conversion
    4186             :         {
    4187       34800 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    4188             :             {
    4189       34000 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    4190             :                        static_cast<const GByte *>(pSrcData) +
    4191       34000 :                            nSrcPixelStride * i,
    4192             :                        nDstDataTypeSize);
    4193             :             }
    4194             :         }
    4195             :         else  // conversion needed: bounce each word through aligned scratch buffers
    4196             :         {
    4197         210 :             const auto getAlignedPtr = [](GByte *ptr, int align)  // rounds ptr up to a multiple of align
    4198             :             {
    4199             :                 return ptr +
    4200         210 :                        ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
    4201         210 :                         align);
    4202             :             };
    4203             : 
    4204             :             // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
    4205             :             // be sure to get correctly aligned pointer.
    4206         105 :             constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
    4207             :             GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
    4208             :             GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
    4209             :             GByte *pabySrcBuffer =
    4210         105 :                 getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
    4211             :             GByte *pabyDstBuffer =
    4212         105 :                 getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
    4213        3360 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    4214             :             {
    4215        3255 :                 memcpy(pabySrcBuffer,
    4216             :                        static_cast<const GByte *>(pSrcData) +
    4217        3255 :                            nSrcPixelStride * i,
    4218             :                        nSrcDataTypeSize);
    4219        3255 :                 GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,  // recursive call converting exactly one aligned word
    4220             :                                 eDstType, 0, 1);
    4221        3255 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    4222             :                        pabyDstBuffer, nDstDataTypeSize);
    4223             :             }
    4224             :         }
    4225         905 :         return;
    4226             :     }
    4227             : 
    4228             :     // Deal with the case where we're replicating a single word into the
    4229             :     // provided buffer
    4230   116773000 :     if (nSrcPixelStride == 0 && nWordCount > 1)
    4231             :     {
    4232     1068100 :         GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
    4233             :                           nDstPixelStride, nWordCount);
    4234     1068100 :         return;
    4235             :     }
    4236             : 
    4237   115705000 :     if (eSrcType == eDstType)  // same-type fast paths: no value conversion required
    4238             :     {
    4239    54674100 :         if (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8)  // both 8-bit types are copied as GByte
    4240             :         {
    4241    17979400 :             GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
    4242             :                          static_cast<const GByte *>(pSrcData), nSrcPixelStride,
    4243             :                          nWordCount);
    4244    17979400 :             return;
    4245             :         }
    4246             : 
    4247    36694700 :         if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&  // any 2-byte type with even strides is copied as short
    4248    21795700 :             (nDstPixelStride % 2) == 0)
    4249             :         {
    4250    21795700 :             GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
    4251             :                          static_cast<const short *>(pSrcData), nSrcPixelStride,
    4252             :                          nWordCount);
    4253    21795700 :             return;
    4254             :         }
    4255             : 
    4256    14899000 :         if (nWordCount == 1)  // single word: memcpy with a constant size per branch
    4257             :         {
    4258             : #if defined(CSA_BUILD) || defined(__COVERITY__)
    4259             :             // Avoid false positives...
    4260             :             memcpy(pDstData, pSrcData, nSrcDataTypeSize);
    4261             : #else
    4262    14411900 :             if (nSrcDataTypeSize == 2)
    4263           0 :                 memcpy(pDstData, pSrcData, 2);
    4264    14411900 :             else if (nSrcDataTypeSize == 4)
    4265    13807600 :                 memcpy(pDstData, pSrcData, 4);
    4266      604283 :             else if (nSrcDataTypeSize == 8)
    4267      587678 :                 memcpy(pDstData, pSrcData, 8);
    4268             :             else /* if( eSrcType == GDT_CFloat64 ) */
    4269       16605 :                 memcpy(pDstData, pSrcData, 16);
    4270             : #endif
    4271    14411900 :             return;
    4272             :         }
    4273             : 
    4274             :         // Let memcpy() handle the case where we're copying a packed buffer
    4275             :         // of pixels.
    4276      487145 :         if (nSrcPixelStride == nDstPixelStride)
    4277             :         {
    4278      225301 :             if (nSrcPixelStride == nSrcDataTypeSize)
    4279             :             {
    4280      225233 :                 memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
    4281      225233 :                 return;
    4282             :             }
    4283             :         }
    4284             :     }  // remaining same-type layouts fall through to the generic switch below
    4285             : 
    4286             :     // Handle the more general case -- deals with conversion of data types
    4287             :     // directly.
    4288    61292800 :     switch (eSrcType)
    4289             :     {
    4290    20306200 :         case GDT_UInt8:
    4291    20306200 :             GDALCopyWordsFromT<unsigned char>(
    4292             :                 static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
    4293             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4294    20306200 :             break;
    4295        1786 :         case GDT_Int8:
    4296        1786 :             GDALCopyWordsFromT<signed char>(
    4297             :                 static_cast<const signed char *>(pSrcData), nSrcPixelStride,
    4298             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4299        1786 :             break;
    4300       55311 :         case GDT_UInt16:
    4301       55311 :             GDALCopyWordsFromT<unsigned short>(
    4302             :                 static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
    4303             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4304       55311 :             break;
    4305     6519830 :         case GDT_Int16:
    4306     6519830 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    4307             :                                       nSrcPixelStride, false, pDstData,
    4308             :                                       eDstType, nDstPixelStride, nWordCount);
    4309     6519830 :             break;
    4310        8016 :         case GDT_UInt32:
    4311        8016 :             GDALCopyWordsFromT<unsigned int>(
    4312             :                 static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
    4313             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4314        8016 :             break;
    4315    12254800 :         case GDT_Int32:
    4316    12254800 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    4317             :                                     nSrcPixelStride, false, pDstData, eDstType,
    4318             :                                     nDstPixelStride, nWordCount);
    4319    12254800 :             break;
    4320        2205 :         case GDT_UInt64:
    4321        2205 :             GDALCopyWordsFromT<std::uint64_t>(
    4322             :                 static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
    4323             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4324        2205 :             break;
    4325       11729 :         case GDT_Int64:
    4326       11729 :             GDALCopyWordsFromT<std::int64_t>(
    4327             :                 static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
    4328             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4329       11729 :             break;
    4330        1387 :         case GDT_Float16:
    4331        1387 :             GDALCopyWordsFromT<GFloat16>(
    4332             :                 static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
    4333             :                 pDstData, eDstType, nDstPixelStride, nWordCount);
    4334        1387 :             break;
    4335      654936 :         case GDT_Float32:
    4336      654936 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    4337             :                                       nSrcPixelStride, false, pDstData,
    4338             :                                       eDstType, nDstPixelStride, nWordCount);
    4339      654936 :             break;
    4340    20715800 :         case GDT_Float64:
    4341    20715800 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    4342             :                                        nSrcPixelStride, false, pDstData,
    4343             :                                        eDstType, nDstPixelStride, nWordCount);
    4344    20715800 :             break;
    4345      478486 :         case GDT_CInt16:  // complex cases reuse the component type and pass true as third argument (presumably "source is complex" - confirm against GDALCopyWordsFromT)
    4346      478486 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    4347             :                                       nSrcPixelStride, true, pDstData, eDstType,
    4348             :                                       nDstPixelStride, nWordCount);
    4349      478486 :             break;
    4350         868 :         case GDT_CInt32:
    4351         868 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    4352             :                                     nSrcPixelStride, true, pDstData, eDstType,
    4353             :                                     nDstPixelStride, nWordCount);
    4354         868 :             break;
    4355         508 :         case GDT_CFloat16:
    4356         508 :             GDALCopyWordsFromT<GFloat16>(
    4357             :                 static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
    4358             :                 pDstData, eDstType, nDstPixelStride, nWordCount);
    4359         508 :             break;
    4360        2437 :         case GDT_CFloat32:
    4361        2437 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    4362             :                                       nSrcPixelStride, true, pDstData, eDstType,
    4363             :                                       nDstPixelStride, nWordCount);
    4364        2437 :             break;
    4365      278517 :         case GDT_CFloat64:
    4366      278517 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    4367             :                                        nSrcPixelStride, true, pDstData,
    4368             :                                        eDstType, nDstPixelStride, nWordCount);
    4369      278517 :             break;
    4370           0 :         case GDT_Unknown:  // presumably unreachable: these types should already be rejected by the size check at the top - confirm
    4371             :         case GDT_TypeCount:
    4372           0 :             CPLAssert(false);
    4373             :     }
    4374             : }
    4375             : 
    4376             : /************************************************************************/
    4377             : /*                            GDALCopyBits()                            */
    4378             : /************************************************************************/
    4379             : 
    4380             : /**
    4381             :  * Bitwise word copying.
    4382             :  *
    4383             :  * A function for moving sets of partial bytes around.  Loosely
    4384             :  * speaking this is a bitwise analog to GDALCopyWords().
    4385             :  *
    4386             :  * It copies nStepCount "words" where each word is nBitCount bits long.
    4387             :  * The nSrcStep and nDstStep are the number of bits from the start of one
    4388             :  * word to the next (same as nBitCount if they are packed).  The nSrcOffset
    4389             :  * and nDstOffset are the offset into the source and destination buffers
    4390             :  * to start at, also measured in bits.
    4391             :  *
    4392             :  * All bit offsets are assumed to start from the high order bit in a byte
    4393             :  * (i.e. most significant bit first).  Currently this function is not very
    4394             :  * optimized, but it may be improved for some common cases in the future
    4395             :  * as needed.
    4396             :  *
    4397             :  * @param pabySrcData the source data buffer.
    4398             :  * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
    4399             :  * first word to copy.
    4400             :  * @param nSrcStep the offset in bits from the start of one source word to
    4401             :  * the start of the next.
    4402             :  * @param pabyDstData the destination data buffer.
    4403             :  * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
    4404             :  * first word to copy over.
    4405             :  * @param nDstStep the offset in bits from the start of one word to the
    4406             :  * start of the next.
    4407             :  * @param nBitCount the number of bits in a word to be copied.
    4408             :  * @param nStepCount the number of words to copy.
    4409             :  */
    4410             : 
    4411           0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
    4412             :                   GByte *pabyDstData, int nDstOffset, int nDstStep,
    4413             :                   int nBitCount, int nStepCount)
    4414             : 
    4415             : {
    4416           0 :     VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");  // NOTE(review): only the source pointer is validated here; pabyDstData is not
    4417             : 
    4418           0 :     for (int iStep = 0; iStep < nStepCount; iStep++)
    4419             :     {
    4420           0 :         for (int iBit = 0; iBit < nBitCount; iBit++)  // one bit per iteration
    4421             :         {
    4422           0 :             if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))  // byte index = offset / 8, mask counts from the high bit
    4423           0 :                 pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
    4424             :             else
    4425           0 :                 pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
    4426             : 
    4427           0 :             nSrcOffset++;
    4428           0 :             nDstOffset++;
    4429             :         }
    4430             : 
    4431           0 :         nSrcOffset += (nSrcStep - nBitCount);  // skip the remainder of the step to reach the next word
    4432           0 :         nDstOffset += (nDstStep - nBitCount);
    4433             :     }
    4434             : }
    4435             : 
    4436             : /************************************************************************/
    4437             : /*                    GDALBandGetBestOverviewLevel()                    */
    4438             : /*                                                                      */
    4439             : /* Returns the best overview level to satisfy the query or -1 if none   */
    4440             : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when        */
    4441             : /* returning a valid overview level                                     */
    4442             : /************************************************************************/
    4443             : 
    4444           0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
    4445             :                                  int &nXSize, int &nYSize, int nBufXSize,
    4446             :                                  int nBufYSize)
    4447             : {  // thin wrapper: delegates to GDALBandGetBestOverviewLevel2() with psExtraArg = nullptr
    4448           0 :     return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
    4449           0 :                                          nBufXSize, nBufYSize, nullptr);
    4450             : }
    4451             : 
    4452      524017 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
    4453             :                                   int &nYOff, int &nXSize, int &nYSize,
    4454             :                                   int nBufXSize, int nBufYSize,
    4455             :                                   GDALRasterIOExtraArg *psExtraArg)
    4456             : {
    4457      524017 :     if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
    4458      524017 :         psExtraArg->bUseOnlyThisScale)
    4459         109 :         return -1;
    4460             :     /* -------------------------------------------------------------------- */
    4461             :     /*      Compute the desired downsampling factor.  It is                 */
    4462             :     /*      based on the least reduced axis, and represents the number      */
    4463             :     /*      of source pixels to one destination pixel.                      */
    4464             :     /* -------------------------------------------------------------------- */
    4465      523908 :     const double dfDesiredDownsamplingFactor =
    4466      523908 :         ((nXSize / static_cast<double>(nBufXSize)) <
    4467      361568 :              (nYSize / static_cast<double>(nBufYSize)) ||
    4468             :          nBufYSize == 1)
    4469      752297 :             ? nXSize / static_cast<double>(nBufXSize)
    4470      133179 :             : nYSize / static_cast<double>(nBufYSize);
    4471             : 
    4472             :     /* -------------------------------------------------------------------- */
    4473             :     /*      Find the overview level that largest downsampling factor (most  */
    4474             :     /*      downsampled) that is still less than (or only a little more)    */
    4475             :     /*      downsampled than the request.                                   */
    4476             :     /* -------------------------------------------------------------------- */
    4477      523908 :     const int nOverviewCount = poBand->GetOverviewCount();
    4478      523908 :     GDALRasterBand *poBestOverview = nullptr;
    4479      523908 :     double dfBestDownsamplingFactor = 0;
    4480      523908 :     int nBestOverviewLevel = -1;
    4481             : 
    4482             :     const char *pszOversampligThreshold =
    4483      523908 :         CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
    4484             : 
    4485             :     // Note: keep this logic for overview selection in sync between
    4486             :     // gdalwarp_lib.cpp and rasterio.cpp
    4487             :     // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
    4488             :     const double dfOversamplingThreshold =
    4489     1047810 :         pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
    4490      523899 :         : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
    4491     1047800 :             ? 1.0
    4492      523908 :             : 1.2;
    4493      526604 :     for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    4494             :     {
    4495        5616 :         GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
    4496       11232 :         if (poOverview == nullptr ||
    4497       11231 :             poOverview->GetXSize() > poBand->GetXSize() ||
    4498        5615 :             poOverview->GetYSize() > poBand->GetYSize())
    4499             :         {
    4500           1 :             continue;
    4501             :         }
    4502             : 
    4503             :         // Compute downsampling factor of this overview
    4504             :         const double dfDownsamplingFactor = std::min(
    4505        5615 :             poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
    4506       11230 :             poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
    4507             : 
    4508             :         // Is it nearly the requested factor and better (lower) than
    4509             :         // the current best factor?
    4510             :         // Use an epsilon because of numerical instability.
    4511        5615 :         constexpr double EPSILON = 1e-1;
    4512        5723 :         if (dfDownsamplingFactor >=
    4513        5615 :                 dfDesiredDownsamplingFactor * dfOversamplingThreshold +
    4514        5507 :                     EPSILON ||
    4515             :             dfDownsamplingFactor <= dfBestDownsamplingFactor)
    4516             :         {
    4517         108 :             continue;
    4518             :         }
    4519             : 
    4520             :         // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
    4521        5507 :         const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
    4522             : 
    4523        5507 :         if (pszResampling != nullptr &&
    4524          71 :             STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
    4525          16 :             continue;
    4526             : 
    4527             :         // OK, this is our new best overview.
    4528        5491 :         poBestOverview = poOverview;
    4529        5491 :         nBestOverviewLevel = iOverview;
    4530        5491 :         dfBestDownsamplingFactor = dfDownsamplingFactor;
    4531             : 
    4532        5491 :         if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
    4533             :             EPSILON)
    4534             :         {
    4535        2920 :             break;
    4536             :         }
    4537             :     }
    4538             : 
    4539             :     /* -------------------------------------------------------------------- */
    4540             :     /*      If we didn't find an overview that helps us, just return        */
    4541             :     /*      indicating failure and the full resolution image will be used.  */
    4542             :     /* -------------------------------------------------------------------- */
    4543      523908 :     if (nBestOverviewLevel < 0)
    4544      520915 :         return -1;
    4545             : 
    4546             :     /* -------------------------------------------------------------------- */
    4547             :     /*      Recompute the source window in terms of the selected            */
    4548             :     /*      overview.                                                       */
    4549             :     /* -------------------------------------------------------------------- */
    4550             :     const double dfXFactor =
    4551        2993 :         poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
    4552             :     const double dfYFactor =
    4553        2993 :         poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
    4554        2993 :     CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
    4555             :              poBestOverview->GetYSize());
    4556             : 
    4557        8979 :     const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
    4558        2993 :                                 static_cast<int>(nXOff / dfXFactor + 0.5));
    4559        8979 :     const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
    4560        2993 :                                 static_cast<int>(nYOff / dfYFactor + 0.5));
    4561        2993 :     int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
    4562        2993 :     int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
    4563        2993 :     if (nOXOff + nOXSize > poBestOverview->GetXSize())
    4564           0 :         nOXSize = poBestOverview->GetXSize() - nOXOff;
    4565        2993 :     if (nOYOff + nOYSize > poBestOverview->GetYSize())
    4566           2 :         nOYSize = poBestOverview->GetYSize() - nOYOff;
    4567             : 
    4568        2993 :     if (psExtraArg)
    4569             :     {
    4570        2993 :         if (psExtraArg->bFloatingPointWindowValidity)
    4571             :         {
    4572         117 :             psExtraArg->dfXOff /= dfXFactor;
    4573         117 :             psExtraArg->dfXSize /= dfXFactor;
    4574         117 :             psExtraArg->dfYOff /= dfYFactor;
    4575         117 :             psExtraArg->dfYSize /= dfYFactor;
    4576             :         }
    4577        2876 :         else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
    4578             :         {
    4579          16 :             psExtraArg->bFloatingPointWindowValidity = true;
    4580          16 :             psExtraArg->dfXOff = nXOff / dfXFactor;
    4581          16 :             psExtraArg->dfXSize = nXSize / dfXFactor;
    4582          16 :             psExtraArg->dfYOff = nYOff / dfYFactor;
    4583          16 :             psExtraArg->dfYSize = nYSize / dfYFactor;
    4584             :         }
    4585             :     }
    4586             : 
    4587        2993 :     nXOff = nOXOff;
    4588        2993 :     nYOff = nOYOff;
    4589        2993 :     nXSize = nOXSize;
    4590        2993 :     nYSize = nOYSize;
    4591             : 
    4592        2993 :     return nBestOverviewLevel;
    4593             : }
    4594             : 
    4595             : /************************************************************************/
    4596             : /*                          OverviewRasterIO()                          */
    4597             : /*                                                                      */
    4598             : /*      Special work function to utilize available overviews to         */
    4599             : /*      more efficiently satisfy downsampled requests.  It will         */
    4600             : /*      return CE_Failure if there are no appropriate overviews         */
    4601             : /*      available but it doesn't emit any error messages.               */
    4602             : /************************************************************************/
    4603             : 
    4604             : //! @cond Doxygen_Suppress
    4605           2 : CPLErr GDALRasterBand::OverviewRasterIO(
    4606             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4607             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4608             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    4609             : 
    4610             : {
    4611             :     GDALRasterIOExtraArg sExtraArg;
    4612           2 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4613             : 
    4614           2 :     const int nOverview = GDALBandGetBestOverviewLevel2(
    4615             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
    4616           2 :     if (nOverview < 0)
    4617           1 :         return CE_Failure;
    4618             : 
    4619             :     /* -------------------------------------------------------------------- */
    4620             :     /*      Recast the call in terms of the new raster layer.               */
    4621             :     /* -------------------------------------------------------------------- */
    4622           1 :     GDALRasterBand *poOverviewBand = GetOverview(nOverview);
    4623           1 :     if (poOverviewBand == nullptr)
    4624           0 :         return CE_Failure;
    4625             : 
    4626           1 :     return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4627             :                                     pData, nBufXSize, nBufYSize, eBufType,
    4628           1 :                                     nPixelSpace, nLineSpace, &sExtraArg);
    4629             : }
    4630             : 
    4631             : /************************************************************************/
    4632             : /*                        TryOverviewRasterIO()                         */
    4633             : /************************************************************************/
    4634             : 
    4635      362428 : CPLErr GDALRasterBand::TryOverviewRasterIO(
    4636             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4637             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4638             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
    4639             :     int *pbTried)
    4640             : {
    4641      362428 :     int nXOffMod = nXOff;
    4642      362428 :     int nYOffMod = nYOff;
    4643      362428 :     int nXSizeMod = nXSize;
    4644      362428 :     int nYSizeMod = nYSize;
    4645             :     GDALRasterIOExtraArg sExtraArg;
    4646             : 
    4647      362428 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4648             : 
    4649      362428 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    4650             :         this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
    4651             :         &sExtraArg);
    4652             : 
    4653      362428 :     if (iOvrLevel >= 0)
    4654             :     {
    4655          53 :         GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
    4656          53 :         if (poOverviewBand)
    4657             :         {
    4658          53 :             *pbTried = TRUE;
    4659          53 :             return poOverviewBand->RasterIO(
    4660             :                 eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
    4661             :                 nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    4662          53 :                 &sExtraArg);
    4663             :         }
    4664             :     }
    4665             : 
    4666      362375 :     *pbTried = FALSE;
    4667      362375 :     return CE_None;
    4668             : }
    4669             : 
    4670             : /************************************************************************/
    4671             : /*                        TryOverviewRasterIO()                         */
    4672             : /************************************************************************/
    4673             : 
    4674      158613 : CPLErr GDALDataset::TryOverviewRasterIO(
    4675             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4676             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4677             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    4678             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
    4679             :     int *pbTried)
    4680             : {
    4681      158613 :     int nXOffMod = nXOff;
    4682      158613 :     int nYOffMod = nYOff;
    4683      158613 :     int nXSizeMod = nXSize;
    4684      158613 :     int nYSizeMod = nYSize;
    4685             :     GDALRasterIOExtraArg sExtraArg;
    4686      158613 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4687             : 
    4688      317226 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    4689      158613 :         papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
    4690             :         nBufYSize, &sExtraArg);
    4691             : 
    4692      158655 :     if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
    4693          42 :         papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
    4694             :     {
    4695          42 :         *pbTried = TRUE;
    4696          42 :         return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
    4697             :             eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
    4698             :             nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
    4699          42 :             nLineSpace, nBandSpace, &sExtraArg);
    4700             :     }
    4701             :     else
    4702             :     {
    4703      158571 :         *pbTried = FALSE;
    4704      158571 :         return CE_None;
    4705             :     }
    4706             : }
    4707             : 
    4708             : /************************************************************************/
    4709             : /*                        GetBestOverviewLevel()                        */
    4710             : /*                                                                      */
    4711             : /* Returns the best overview level to satisfy the query or -1 if none.  */
    4712             : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid     */
    4713             : /* overview level                                                       */
    4714             : /************************************************************************/
    4715             : 
    4716           4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
    4717             :                                            int &nYOff, int &nXSize, int &nYSize,
    4718             :                                            int nBufXSize, int nBufYSize,
    4719             :                                            int nBandCount,
    4720             :                                            const int *panBandMap,
    4721             :                                            GDALRasterIOExtraArg *psExtraArg)
    4722             : {
    4723           4 :     int nOverviewCount = 0;
    4724           4 :     GDALRasterBand *poFirstBand = nullptr;
    4725             : 
    4726             :     /* -------------------------------------------------------------------- */
    4727             :     /* Check that all bands have the same number of overviews and           */
    4728             :     /* that they have all the same size and block dimensions                */
    4729             :     /* -------------------------------------------------------------------- */
    4730          12 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4731             :     {
    4732           8 :         GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
    4733           8 :         if (poBand == nullptr)
    4734           0 :             return -1;
    4735           8 :         if (iBand == 0)
    4736             :         {
    4737           4 :             poFirstBand = poBand;
    4738           4 :             nOverviewCount = poBand->GetOverviewCount();
    4739             :         }
    4740           4 :         else if (nOverviewCount != poBand->GetOverviewCount())
    4741             :         {
    4742           0 :             CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    4743             :                              "mismatched overview count, use std method.");
    4744           0 :             return -1;
    4745             :         }
    4746             :         else
    4747             :         {
    4748           4 :             for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    4749             :             {
    4750           0 :                 GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
    4751             :                 GDALRasterBand *poOvrFirstBand =
    4752           0 :                     poFirstBand->GetOverview(iOverview);
    4753           0 :                 if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
    4754           0 :                     continue;
    4755             : 
    4756           0 :                 if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
    4757           0 :                     poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
    4758             :                 {
    4759           0 :                     CPLDebug("GDAL",
    4760             :                              "GDALDataset::GetBestOverviewLevel() ... "
    4761             :                              "mismatched overview sizes, use std method.");
    4762           0 :                     return -1;
    4763             :                 }
    4764           0 :                 int nBlockXSizeFirst = 0;
    4765           0 :                 int nBlockYSizeFirst = 0;
    4766           0 :                 poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
    4767             :                                              &nBlockYSizeFirst);
    4768             : 
    4769           0 :                 int nBlockXSizeCurrent = 0;
    4770           0 :                 int nBlockYSizeCurrent = 0;
    4771           0 :                 poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
    4772             :                                         &nBlockYSizeCurrent);
    4773             : 
    4774           0 :                 if (nBlockXSizeFirst != nBlockXSizeCurrent ||
    4775           0 :                     nBlockYSizeFirst != nBlockYSizeCurrent)
    4776             :                 {
    4777           0 :                     CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    4778             :                                      "mismatched block sizes, use std method.");
    4779           0 :                     return -1;
    4780             :                 }
    4781             :             }
    4782             :         }
    4783             :     }
    4784           4 :     if (poFirstBand == nullptr)
    4785           0 :         return -1;
    4786             : 
    4787           4 :     return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
    4788             :                                          nYSize, nBufXSize, nBufYSize,
    4789           4 :                                          psExtraArg);
    4790             : }
    4791             : 
    4792             : /************************************************************************/
    4793             : /*                         BlockBasedRasterIO()                         */
    4794             : /*                                                                      */
    4795             : /*      This convenience function implements a dataset level            */
    4796             : /*      RasterIO() interface based on calling down to fetch blocks,     */
    4797             : /*      much like the GDALRasterBand::IRasterIO(), but it handles       */
    4798             : /*      all bands at once, so that a format driver that handles a       */
    4799             : /*      request for different bands of the same block efficiently       */
    4800             : /*      (i.e. without re-reading interleaved data) is used efficiently. */
    4801             : /*                                                                      */
    4802             : /*      This method is intended to be called by an overridden           */
    4803             : /*      IRasterIO() method in the driver specific GDALDataset           */
    4804             : /*      derived class.                                                  */
    4805             : /*                                                                      */
    4806             : /*      Default internal implementation of RasterIO() ... utilizes      */
    4807             : /*      the Block access methods to satisfy the request.  This would    */
    4808             : /*      normally only be overridden by formats with overviews.          */
    4809             : /*                                                                      */
    4810             : /*      To keep things relatively simple, this method does not          */
    4811             : /*      currently take advantage of some special cases addressed in     */
    4812             : /*      GDALRasterBand::IRasterIO(), so it is likely best to only       */
    4813             : /*      call it when you know it will help.  That is in cases where     */
    4814             : /*      data is at 1:1 to the buffer, and you know the driver is        */
    4815             : /*      implementing interleaved IO efficiently on a block by block     */
    4816             : /*      basis. Overviews will be used when possible.                    */
    4817             : /************************************************************************/
    4818             : 
    4819       64982 : CPLErr GDALDataset::BlockBasedRasterIO(
    4820             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4821             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4822             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    4823             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    4824             : 
    4825             : {
    4826       64982 :     CPLAssert(nullptr != pData);
    4827             : 
    4828       64982 :     GByte **papabySrcBlock = nullptr;
    4829       64982 :     GDALRasterBlock *poBlock = nullptr;
    4830       64982 :     GDALRasterBlock **papoBlocks = nullptr;
    4831       64982 :     int nLBlockX = -1;
    4832       64982 :     int nLBlockY = -1;
    4833             :     int iBufYOff;
    4834             :     int iBufXOff;
    4835       64982 :     int nBlockXSize = 1;
    4836       64982 :     int nBlockYSize = 1;
    4837       64982 :     CPLErr eErr = CE_None;
    4838       64982 :     GDALDataType eDataType = GDT_UInt8;
    4839             : 
    4840       64982 :     const bool bUseIntegerRequestCoords =
    4841       65020 :         (!psExtraArg->bFloatingPointWindowValidity ||
    4842          38 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
    4843          36 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
    4844             : 
    4845             :     /* -------------------------------------------------------------------- */
    4846             :     /*      Ensure that all bands share a common block size and data type.  */
    4847             :     /* -------------------------------------------------------------------- */
    4848      308187 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4849             :     {
    4850      243205 :         GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4851             : 
    4852      243205 :         if (iBand == 0)
    4853             :         {
    4854       64982 :             poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4855       64982 :             eDataType = poBand->GetRasterDataType();
    4856             :         }
    4857             :         else
    4858             :         {
    4859      178223 :             int nThisBlockXSize = 0;
    4860      178223 :             int nThisBlockYSize = 0;
    4861      178223 :             poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
    4862      178223 :             if (nThisBlockXSize != nBlockXSize ||
    4863      178223 :                 nThisBlockYSize != nBlockYSize)
    4864             :             {
    4865           0 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4866             :                                  "mismatched block sizes, use std method.");
    4867           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4868             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4869             :                                          nBandCount, panBandMap, nPixelSpace,
    4870           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4871             :             }
    4872             : 
    4873      178223 :             if (eDataType != poBand->GetRasterDataType() &&
    4874           0 :                 (nXSize != nBufXSize || nYSize != nBufYSize))
    4875             :             {
    4876           0 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4877             :                                  "mismatched band data types, use std method.");
    4878           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4879             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4880             :                                          nBandCount, panBandMap, nPixelSpace,
    4881           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4882             :             }
    4883             :         }
    4884             :     }
    4885             : 
    4886             :     /* ==================================================================== */
    4887             :     /*      In this special case at full resolution we step through in      */
    4888             :     /*      blocks, turning the request over to the per-band                */
    4889             :     /*      IRasterIO(), but ensuring that all bands of one block are       */
    4890             :     /*      called before proceeding to the next.                           */
    4891             :     /* ==================================================================== */
    4892             : 
    4893       64982 :     if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
    4894             :     {
    4895             :         GDALRasterIOExtraArg sDummyExtraArg;
    4896       64978 :         INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
    4897             : 
    4898       64978 :         int nChunkYSize = 0;
    4899       64978 :         int nChunkXSize = 0;
    4900             : 
    4901      213434 :         for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
    4902             :         {
    4903      149472 :             const int nChunkYOff = iBufYOff + nYOff;
    4904      149472 :             nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
    4905      149472 :             if (nChunkYOff + nChunkYSize > nYOff + nYSize)
    4906       59977 :                 nChunkYSize = (nYOff + nYSize) - nChunkYOff;
    4907             : 
    4908      822752 :             for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
    4909             :             {
    4910      674295 :                 const int nChunkXOff = iBufXOff + nXOff;
    4911      674295 :                 nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
    4912      674295 :                 if (nChunkXOff + nChunkXSize > nXOff + nXSize)
    4913       70691 :                     nChunkXSize = (nXOff + nXSize) - nChunkXOff;
    4914             : 
    4915      674295 :                 GByte *pabyChunkData =
    4916      674295 :                     static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
    4917      674295 :                     static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
    4918             : 
    4919     3282490 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4920             :                 {
    4921     2609210 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4922             : 
    4923     5218420 :                     eErr = poBand->IRasterIO(
    4924             :                         eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
    4925             :                         nChunkYSize,
    4926     2609210 :                         pabyChunkData +
    4927     2609210 :                             static_cast<GPtrDiff_t>(iBand) * nBandSpace,
    4928             :                         nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
    4929     2609210 :                         nLineSpace, &sDummyExtraArg);
    4930     2609210 :                     if (eErr != CE_None)
    4931        1015 :                         return eErr;
    4932             :                 }
    4933             :             }
    4934             : 
    4935      167371 :             if (psExtraArg->pfnProgress != nullptr &&
    4936       18914 :                 !psExtraArg->pfnProgress(
    4937      167371 :                     1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
    4938             :                         nBufYSize,
    4939             :                     "", psExtraArg->pProgressData))
    4940             :             {
    4941           1 :                 return CE_Failure;
    4942             :             }
    4943             :         }
    4944             : 
    4945       63962 :         return CE_None;
    4946             :     }
    4947             : 
    4948             :     /* The code below is not compatible with that case. It would need a */
    4949             :     /* completely separate code path, as done in GDALRasterBand::IRasterIO. */
    4950           4 :     if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
    4951             :     {
    4952           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4953             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4954             :                                  panBandMap, nPixelSpace, nLineSpace,
    4955           0 :                                  nBandSpace, psExtraArg);
    4956             :     }
    4957             : 
    4958             :     /* We could have a smarter implementation, but that will do for now */
    4959           4 :     if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    4960           0 :         (nBufXSize != nXSize || nBufYSize != nYSize))
    4961             :     {
    4962           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4963             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4964             :                                  panBandMap, nPixelSpace, nLineSpace,
    4965           0 :                                  nBandSpace, psExtraArg);
    4966             :     }
    4967             : 
    4968             :     /* ==================================================================== */
    4969             :     /*      Loop reading required source blocks to satisfy output           */
    4970             :     /*      request.  This is the most general implementation.              */
    4971             :     /* ==================================================================== */
    4972             : 
    4973           4 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
    4974             : 
    4975             :     papabySrcBlock =
    4976           4 :         static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
    4977             :     papoBlocks =
    4978           4 :         static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
    4979             : 
    4980             :     /* -------------------------------------------------------------------- */
    4981             :     /*      Select an overview level if appropriate.                        */
    4982             :     /* -------------------------------------------------------------------- */
    4983             : 
    4984             :     GDALRasterIOExtraArg sExtraArg;
    4985           4 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4986           4 :     const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
    4987             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
    4988             :         panBandMap, &sExtraArg);
    4989           4 :     if (nOverviewLevel >= 0)
    4990             :     {
    4991           2 :         GetRasterBand(panBandMap[0])
    4992           2 :             ->GetOverview(nOverviewLevel)
    4993           2 :             ->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4994             :     }
    4995             : 
    4996           4 :     double dfXOff = nXOff;
    4997           4 :     double dfYOff = nYOff;
    4998           4 :     double dfXSize = nXSize;
    4999           4 :     double dfYSize = nYSize;
    5000           4 :     if (sExtraArg.bFloatingPointWindowValidity)
    5001             :     {
    5002           2 :         dfXOff = sExtraArg.dfXOff;
    5003           2 :         dfYOff = sExtraArg.dfYOff;
    5004           2 :         dfXSize = sExtraArg.dfXSize;
    5005           2 :         dfYSize = sExtraArg.dfYSize;
    5006             :     }
    5007             : 
    5008             :     /* -------------------------------------------------------------------- */
    5009             :     /*      Compute stepping increment.                                     */
    5010             :     /* -------------------------------------------------------------------- */
    5011           4 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
    5012           4 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
    5013             : 
    5014           4 :     constexpr double EPS = 1e-10;
    5015             :     /* -------------------------------------------------------------------- */
    5016             :     /*      Loop over buffer computing source locations.                    */
    5017             :     /* -------------------------------------------------------------------- */
    5018          36 :     for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
    5019             :     {
    5020             :         GPtrDiff_t iSrcOffset;
    5021             : 
    5022             :         // Add small epsilon to avoid some numeric precision issues.
    5023          32 :         const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
    5024          32 :         const int iSrcY = static_cast<int>(std::min(
    5025          32 :             std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
    5026             : 
    5027          32 :         GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
    5028             :                                 static_cast<GPtrDiff_t>(nLineSpace);
    5029             : 
    5030         302 :         for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
    5031             :         {
    5032         270 :             const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
    5033         270 :             const int iSrcX = static_cast<int>(std::min(
    5034         270 :                 std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
    5035             : 
    5036             :             // FIXME: this code likely doesn't work if the dirty block gets
    5037             :             // flushed to disk before being completely written. In the meantime,
    5038             :             // bJustInitialize should probably be set to FALSE even if it is not
    5039             :             // ideal performance wise, and for lossy compression
    5040             : 
    5041             :             /* --------------------------------------------------------------------
    5042             :              */
    5043             :             /*      Ensure we have the appropriate block loaded. */
    5044             :             /* --------------------------------------------------------------------
    5045             :              */
    5046         270 :             if (iSrcX < nLBlockX * nBlockXSize ||
    5047         270 :                 iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
    5048         266 :                 iSrcY < nLBlockY * nBlockYSize ||
    5049         266 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
    5050             :             {
    5051           4 :                 nLBlockX = iSrcX / nBlockXSize;
    5052           4 :                 nLBlockY = iSrcY / nBlockYSize;
    5053             : 
    5054           4 :                 const bool bJustInitialize =
    5055           0 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
    5056           0 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
    5057           4 :                     nXOff <= nLBlockX * nBlockXSize &&
    5058           0 :                     nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
    5059             :                 /*bool bMemZeroBuffer = FALSE;
    5060             :                 if( eRWFlag == GF_Write && !bJustInitialize &&
    5061             :                     nXOff <= nLBlockX * nBlockXSize &&
    5062             :                     nYOff <= nLBlockY * nBlockYSize &&
    5063             :                     (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
    5064             :                      (nXOff + nXSize == GetRasterXSize() &&
    5065             :                      (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
    5066             :                     (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
    5067             :                      (nYOff + nYSize == GetRasterYSize() &&
    5068             :                      (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
    5069             :                 {
    5070             :                     bJustInitialize = TRUE;
    5071             :                     bMemZeroBuffer = TRUE;
    5072             :                 }*/
    5073          12 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    5074             :                 {
    5075           8 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    5076           8 :                     if (nOverviewLevel >= 0)
    5077           2 :                         poBand = poBand->GetOverview(nOverviewLevel);
    5078          16 :                     poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
    5079           8 :                                                         bJustInitialize);
    5080           8 :                     if (poBlock == nullptr)
    5081             :                     {
    5082           0 :                         eErr = CE_Failure;
    5083           0 :                         goto CleanupAndReturn;
    5084             :                     }
    5085             : 
    5086           8 :                     if (eRWFlag == GF_Write)
    5087           0 :                         poBlock->MarkDirty();
    5088             : 
    5089           8 :                     if (papoBlocks[iBand] != nullptr)
    5090           0 :                         papoBlocks[iBand]->DropLock();
    5091             : 
    5092           8 :                     papoBlocks[iBand] = poBlock;
    5093             : 
    5094           8 :                     papabySrcBlock[iBand] =
    5095           8 :                         static_cast<GByte *>(poBlock->GetDataRef());
    5096             :                     /*if( bMemZeroBuffer )
    5097             :                     {
    5098             :                         memset(papabySrcBlock[iBand], 0,
    5099             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
    5100             :                     * nBlockYSize);
    5101             :                     }*/
    5102             :                 }
    5103             :             }
    5104             : 
    5105             :             /* --------------------------------------------------------------------
    5106             :              */
    5107             :             /*      Copy over this pixel of data. */
    5108             :             /* --------------------------------------------------------------------
    5109             :              */
    5110         270 :             iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
    5111         270 :                           static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
    5112         270 :                           (static_cast<GPtrDiff_t>(iSrcY) -
    5113         270 :                            static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
    5114         270 :                               nBlockXSize) *
    5115         270 :                          nBandDataSize;
    5116             : 
    5117         980 :             for (int iBand = 0; iBand < nBandCount; iBand++)
    5118             :             {
    5119         710 :                 GByte *pabySrcBlock = papabySrcBlock[iBand];
    5120         710 :                 GPtrDiff_t iBandBufOffset =
    5121         710 :                     iBufOffset + static_cast<GPtrDiff_t>(iBand) *
    5122             :                                      static_cast<GPtrDiff_t>(nBandSpace);
    5123             : 
    5124         710 :                 if (eDataType == eBufType)
    5125             :                 {
    5126         710 :                     if (eRWFlag == GF_Read)
    5127         710 :                         memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
    5128         710 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
    5129             :                     else
    5130           0 :                         memcpy(pabySrcBlock + iSrcOffset,
    5131             :                                static_cast<const GByte *>(pData) +
    5132           0 :                                    iBandBufOffset,
    5133             :                                nBandDataSize);
    5134             :                 }
    5135             :                 else
    5136             :                 {
    5137             :                     /* type to type conversion ... ouch, this is expensive way
    5138             :                        of handling single words */
    5139             : 
    5140           0 :                     if (eRWFlag == GF_Read)
    5141           0 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
    5142             :                                         static_cast<GByte *>(pData) +
    5143           0 :                                             iBandBufOffset,
    5144             :                                         eBufType, 0, 1);
    5145             :                     else
    5146           0 :                         GDALCopyWords64(static_cast<const GByte *>(pData) +
    5147           0 :                                             iBandBufOffset,
    5148           0 :                                         eBufType, 0, pabySrcBlock + iSrcOffset,
    5149             :                                         eDataType, 0, 1);
    5150             :                 }
    5151             :             }
    5152             : 
    5153         270 :             iBufOffset += static_cast<int>(nPixelSpace);
    5154             :         }
    5155             :     }
    5156             : 
    5157             :     /* -------------------------------------------------------------------- */
    5158             :     /*      CleanupAndReturn.                                               */
    5159             :     /* -------------------------------------------------------------------- */
    5160           4 : CleanupAndReturn:
    5161           4 :     CPLFree(papabySrcBlock);
    5162           4 :     if (papoBlocks != nullptr)
    5163             :     {
    5164          12 :         for (int iBand = 0; iBand < nBandCount; iBand++)
    5165             :         {
    5166           8 :             if (papoBlocks[iBand] != nullptr)
    5167           8 :                 papoBlocks[iBand]->DropLock();
    5168             :         }
    5169           4 :         CPLFree(papoBlocks);
    5170             :     }
    5171             : 
    5172           4 :     return eErr;
    5173             : }
    5174             : 
    5175             : //! @endcond
    5176             : 
    5177             : /************************************************************************/
    5178             : /*                  GDALCopyWholeRasterGetSwathSize()                   */
    5179             : /************************************************************************/
    5180             : 
    5181        3376 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
    5182             :                                             GDALRasterBand *poDstPrototypeBand,
    5183             :                                             int nBandCount,
    5184             :                                             int bDstIsCompressed,
    5185             :                                             int bInterleave, int *pnSwathCols,
    5186             :                                             int *pnSwathLines)
    5187             : {
    5188        3376 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    5189        3376 :     int nSrcBlockXSize = 0;
    5190        3376 :     int nSrcBlockYSize = 0;
    5191        3376 :     int nBlockXSize = 0;
    5192        3376 :     int nBlockYSize = 0;
    5193             : 
    5194        3376 :     int nXSize = poSrcPrototypeBand->GetXSize();
    5195        3376 :     int nYSize = poSrcPrototypeBand->GetYSize();
    5196             : 
    5197        3376 :     poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
    5198        3376 :     poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    5199             : 
    5200        3376 :     const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
    5201        3376 :     const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
    5202             : 
    5203        3376 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5204        3376 :     if (bInterleave)
    5205         583 :         nPixelSize *= nBandCount;
    5206             : 
    5207             :     // aim for one row of blocks.  Do not settle for less.
    5208        3376 :     int nSwathCols = nXSize;
    5209        3376 :     int nSwathLines = nMaxBlockYSize;
    5210             : 
    5211             :     const char *pszSrcCompression =
    5212        3376 :         poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    5213        3376 :     if (pszSrcCompression == nullptr)
    5214             :     {
    5215        3356 :         auto poSrcDS = poSrcPrototypeBand->GetDataset();
    5216        3356 :         if (poSrcDS)
    5217             :             pszSrcCompression =
    5218        3350 :                 poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    5219             :     }
    5220             : 
    5221             :     /* -------------------------------------------------------------------- */
    5222             :     /*      What will our swath size be?                                    */
    5223             :     /* -------------------------------------------------------------------- */
    5224             :     // When writing interleaved data in a compressed format, we want to be sure
    5225             :     // that each block will only be written once, so the swath size must not be
    5226             :     // greater than the block cache.
    5227        3376 :     const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
    5228             :     int nTargetSwathSize;
    5229        3376 :     if (pszSwathSize != nullptr)
    5230           0 :         nTargetSwathSize = static_cast<int>(
    5231           0 :             std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
    5232             :     else
    5233             :     {
    5234             :         // As a default, take one 1/4 of the cache size.
    5235        3376 :         nTargetSwathSize = static_cast<int>(
    5236        3376 :             std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
    5237             : 
    5238             :         // but if the minimum idal swath buf size is less, then go for it to
    5239             :         // avoid unnecessarily abusing RAM usage.
    5240             :         // but try to use 10 MB at least.
    5241        3376 :         GIntBig nIdealSwathBufSize =
    5242        3376 :             static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
    5243        3376 :         int nMinTargetSwathSize = 10 * 1000 * 1000;
    5244             : 
    5245        3376 :         if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
    5246        3376 :              GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
    5247             :         {
    5248           1 :             nMinTargetSwathSize = nTargetSwathSize;
    5249             :         }
    5250             : 
    5251        3376 :         if (nIdealSwathBufSize < nTargetSwathSize &&
    5252        3366 :             nIdealSwathBufSize < nMinTargetSwathSize)
    5253             :         {
    5254        3363 :             nIdealSwathBufSize = nMinTargetSwathSize;
    5255             :         }
    5256             : 
    5257        3376 :         if (pszSrcCompression != nullptr &&
    5258         181 :             EQUAL(pszSrcCompression, "JPEG2000") &&
    5259           0 :             (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
    5260           0 :                                    (nSrcBlockYSize % nBlockYSize) == 0)))
    5261             :         {
    5262           2 :             nIdealSwathBufSize =
    5263           4 :                 std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
    5264           2 :                                                  nSrcBlockYSize * nPixelSize);
    5265             :         }
    5266        3376 :         if (nTargetSwathSize > nIdealSwathBufSize)
    5267        3363 :             nTargetSwathSize = static_cast<int>(
    5268        3363 :                 std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
    5269             :     }
    5270             : 
    5271        3376 :     if (nTargetSwathSize < 1000000)
    5272           8 :         nTargetSwathSize = 1000000;
    5273             : 
    5274             :     /* But let's check that  */
    5275        3597 :     if (bDstIsCompressed && bInterleave &&
    5276         221 :         nTargetSwathSize > GDALGetCacheMax64())
    5277             :     {
    5278           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    5279             :                  "When translating into a compressed interleave format, "
    5280             :                  "the block cache size (" CPL_FRMT_GIB ") "
    5281             :                  "should be at least the size of the swath (%d) "
    5282             :                  "(GDAL_SWATH_SIZE config. option)",
    5283             :                  GDALGetCacheMax64(), nTargetSwathSize);
    5284             :     }
    5285             : 
    5286             : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
    5287             : #define ROUND_TO(x, y) (((x) / (y)) * (y))
    5288             : 
    5289             :     // if both input and output datasets are tiled, that the tile dimensions
    5290             :     // are "compatible", try to stick  to a swath dimension that is a multiple
    5291             :     // of input and output block dimensions.
    5292        3376 :     if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
    5293          47 :         IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
    5294          47 :         IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
    5295          47 :         IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    5296          47 :         IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    5297             :     {
    5298          47 :         if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
    5299          47 :                 nPixelSize <=
    5300          47 :             static_cast<GIntBig>(nTargetSwathSize))
    5301             :         {
    5302          47 :             nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
    5303          47 :             nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
    5304          47 :             if (nSwathCols == 0)
    5305           0 :                 nSwathCols = nMaxBlockXSize;
    5306          47 :             if (nSwathCols > nXSize)
    5307          45 :                 nSwathCols = nXSize;
    5308          47 :             nSwathLines = nMaxBlockYSize;
    5309             : 
    5310          47 :             if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
    5311          47 :                 static_cast<GIntBig>(nTargetSwathSize))
    5312             :             {
    5313           0 :                 nSwathCols = nXSize;
    5314           0 :                 nSwathLines = nBlockYSize;
    5315             :             }
    5316             :         }
    5317             :     }
    5318             : 
    5319        3376 :     const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
    5320        3376 :     const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
    5321        3376 :     if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
    5322             :     {
    5323           1 :         nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
    5324           1 :         if (nSwathLines == 0)
    5325           1 :             nSwathLines = 1;
    5326             : 
    5327           1 :         CPLDebug(
    5328             :             "GDAL",
    5329             :             "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
    5330             :             "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
    5331             :             "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
    5332           1 :             nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
    5333             :     }
    5334             :     // If we are processing single scans, try to handle several at once.
    5335             :     // If we are handling swaths already, only grow the swath if a row
    5336             :     // of blocks is substantially less than our target buffer size.
    5337        3375 :     else if (nSwathLines == 1 ||
    5338        2824 :              nMemoryPerCol * nSwathLines <
    5339        2824 :                  static_cast<GIntBig>(nTargetSwathSize) / 10)
    5340             :     {
    5341        3347 :         nSwathLines = std::min(
    5342             :             nYSize,
    5343        3347 :             std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
    5344             : 
    5345             :         /* If possible try to align to source and target block height */
    5346        3347 :         if ((nSwathLines % nMaxBlockYSize) != 0 &&
    5347         273 :             nSwathLines > nMaxBlockYSize &&
    5348         273 :             IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    5349         244 :             IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    5350         217 :             nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
    5351             :     }
    5352             : 
    5353        3376 :     if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
    5354           0 :         (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
    5355           0 :                                IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
    5356             :     {
    5357             :         // Typical use case: converting from Pleaiades that is 2048x2048 tiled.
    5358           2 :         if (nSwathLines < nSrcBlockYSize)
    5359             :         {
    5360           0 :             nSwathLines = nSrcBlockYSize;
    5361             : 
    5362             :             // Number of pixels that can be read/write simultaneously.
    5363           0 :             nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
    5364           0 :             nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
    5365           0 :             if (nSwathCols == 0)
    5366           0 :                 nSwathCols = nSrcBlockXSize;
    5367           0 :             if (nSwathCols > nXSize)
    5368           0 :                 nSwathCols = nXSize;
    5369             : 
    5370           0 :             CPLDebug(
    5371             :                 "GDAL",
    5372             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    5373             :                 "too high block, "
    5374             :                 "use partial width at one time");
    5375             :         }
    5376           2 :         else if ((nSwathLines % nSrcBlockYSize) != 0)
    5377             :         {
    5378             :             /* Round on a multiple of nSrcBlockYSize */
    5379           0 :             nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
    5380           0 :             CPLDebug(
    5381             :                 "GDAL",
    5382             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    5383             :                 "round nSwathLines to block height : %d",
    5384             :                 nSwathLines);
    5385             :         }
    5386             :     }
    5387        3374 :     else if (bDstIsCompressed)
    5388             :     {
    5389         419 :         if (nSwathLines < nBlockYSize)
    5390             :         {
    5391         146 :             nSwathLines = nBlockYSize;
    5392             : 
    5393             :             // Number of pixels that can be read/write simultaneously.
    5394         146 :             nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
    5395         146 :             nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
    5396         146 :             if (nSwathCols == 0)
    5397           0 :                 nSwathCols = nBlockXSize;
    5398         146 :             if (nSwathCols > nXSize)
    5399         146 :                 nSwathCols = nXSize;
    5400             : 
    5401         146 :             CPLDebug(
    5402             :                 "GDAL",
    5403             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    5404             :                 "too high block, "
    5405             :                 "use partial width at one time");
    5406             :         }
    5407         273 :         else if ((nSwathLines % nBlockYSize) != 0)
    5408             :         {
    5409             :             // Round on a multiple of nBlockYSize.
    5410           9 :             nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
    5411           9 :             CPLDebug(
    5412             :                 "GDAL",
    5413             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    5414             :                 "round nSwathLines to block height : %d",
    5415             :                 nSwathLines);
    5416             :         }
    5417             :     }
    5418             : 
    5419        3376 :     *pnSwathCols = nSwathCols;
    5420        3376 :     *pnSwathLines = nSwathLines;
    5421        3376 : }
    5422             : 
    5423             : /************************************************************************/
    5424             : /*                     GDALDatasetCopyWholeRaster()                     */
    5425             : /************************************************************************/
    5426             : 
    5427             : /**
    5428             :  * \brief Copy all dataset raster data.
    5429             :  *
    5430             :  * This function copies the complete raster contents of one dataset to
    5431             :  * another similarly configured dataset.  The source and destination
    5432             :  * dataset must have the same number of bands, and the same width
    5433             :  * and height.  The bands do not have to have the same data type.
    5434             :  *
    5435             :  * This function is primarily intended to support implementation of
    5436             :  * driver specific CreateCopy() functions.  It implements efficient copying,
    5437             :  * in particular "chunking" the copy in substantial blocks and, if appropriate,
    5438             :  * performing the transfer in a pixel interleaved fashion.
    5439             :  *
    5440             :  * Currently the only papszOptions values supported are:
    5441             :  * <ul>
    5442             :  * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
    5443             :  * write access pattern (this does not modify the layout of the destination
    5444             :  * data)</li>
    5445             :  * <li>"COMPRESSED=YES" to force alignment on target dataset block
    5446             :  * sizes to achieve best compression.</li>
    5447             :  * <li>"SKIP_HOLES=YES" to skip chunks
    5448             :  * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
    5449             :  * (GDAL &gt;= 2.2)</li>
    5450             :  * </ul>
    5451             :  * More options may be supported in the future.
    5452             :  *
    5453             :  * @param hSrcDS the source dataset
    5454             :  * @param hDstDS the destination dataset
    5455             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    5456             :  * @param pfnProgress progress reporting function.
    5457             :  * @param pProgressData callback data for progress function.
    5458             :  *
    5459             :  * @return CE_None on success, or CE_Failure on failure.
    5460             :  */
    5461             : 
    5462        3348 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
    5463             :                                               GDALDatasetH hDstDS,
    5464             :                                               CSLConstList papszOptions,
    5465             :                                               GDALProgressFunc pfnProgress,
    5466             :                                               void *pProgressData)
    5467             : 
    5468             : {
    5469        3348 :     VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    5470        3348 :     VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    5471             : 
    5472        3348 :     GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
    5473        3348 :     GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
    5474             : 
    5475        3348 :     if (pfnProgress == nullptr)
    5476           0 :         pfnProgress = GDALDummyProgress;
    5477             : 
    5478             :     /* -------------------------------------------------------------------- */
    5479             :     /*      Confirm the datasets match in size and band counts.             */
    5480             :     /* -------------------------------------------------------------------- */
    5481        3348 :     const int nXSize = poDstDS->GetRasterXSize();
    5482        3348 :     const int nYSize = poDstDS->GetRasterYSize();
    5483        3348 :     const int nBandCount = poDstDS->GetRasterCount();
    5484             : 
    5485        3348 :     if (poSrcDS->GetRasterXSize() != nXSize ||
    5486        6696 :         poSrcDS->GetRasterYSize() != nYSize ||
    5487        3348 :         poSrcDS->GetRasterCount() != nBandCount)
    5488             :     {
    5489           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    5490             :                  "Input and output dataset sizes or band counts do not\n"
    5491             :                  "match in GDALDatasetCopyWholeRaster()");
    5492           0 :         return CE_Failure;
    5493             :     }
    5494             : 
    5495             :     /* -------------------------------------------------------------------- */
    5496             :     /*      Report preliminary (0) progress.                                */
    5497             :     /* -------------------------------------------------------------------- */
    5498        3348 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    5499             :     {
    5500           1 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    5501             :                  "User terminated CreateCopy()");
    5502           1 :         return CE_Failure;
    5503             :     }
    5504             : 
    5505             :     /* -------------------------------------------------------------------- */
    5506             :     /*      Get our prototype band, and assume the others are similarly     */
    5507             :     /*      configured.                                                     */
    5508             :     /* -------------------------------------------------------------------- */
    5509        3347 :     if (nBandCount == 0)
    5510           0 :         return CE_None;
    5511             : 
    5512        3347 :     GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
    5513        3347 :     GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
    5514        3347 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    5515             : 
    5516             :     /* -------------------------------------------------------------------- */
    5517             :     /*      Do we want to try and do the operation in a pixel               */
    5518             :     /*      interleaved fashion?                                            */
    5519             :     /* -------------------------------------------------------------------- */
    5520        3347 :     bool bInterleave = false;
    5521             :     const char *pszInterleave =
    5522        3347 :         poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    5523        3347 :     if (pszInterleave != nullptr &&
    5524        2943 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    5525         209 :         bInterleave = true;
    5526             : 
    5527        3347 :     pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    5528        3347 :     if (pszInterleave != nullptr &&
    5529        2882 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    5530         528 :         bInterleave = true;
    5531             : 
    5532        3347 :     pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
    5533        3347 :     if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
    5534           5 :         bInterleave = true;
    5535        3342 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
    5536          13 :         bInterleave = false;
    5537             :     // attributes is specific to the TileDB driver
    5538        3329 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
    5539           4 :         bInterleave = true;
    5540        3325 :     else if (pszInterleave != nullptr)
    5541             :     {
    5542           0 :         CPLError(CE_Warning, CPLE_NotSupported,
    5543             :                  "Unsupported value for option INTERLEAVE");
    5544             :     }
    5545             : 
    5546             :     // If the destination is compressed, we must try to write blocks just once,
    5547             :     // to save disk space (GTiff case for example), and to avoid data loss
    5548             :     // (JPEG compression for example).
    5549        3347 :     bool bDstIsCompressed = false;
    5550             :     const char *pszDstCompressed =
    5551        3347 :         CSLFetchNameValue(papszOptions, "COMPRESSED");
    5552        3347 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5553         393 :         bDstIsCompressed = true;
    5554             : 
    5555             :     /* -------------------------------------------------------------------- */
    5556             :     /*      What will our swath size be?                                    */
    5557             :     /* -------------------------------------------------------------------- */
    5558             : 
    5559        3347 :     int nSwathCols = 0;
    5560        3347 :     int nSwathLines = 0;
    5561        3347 :     GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
    5562             :                                     nBandCount, bDstIsCompressed, bInterleave,
    5563             :                                     &nSwathCols, &nSwathLines);
    5564             : 
    5565        3347 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5566        3347 :     if (bInterleave)
    5567         583 :         nPixelSize *= nBandCount;
    5568             : 
    5569        3347 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    5570        3347 :     if (pSwathBuf == nullptr)
    5571             :     {
    5572           0 :         return CE_Failure;
    5573             :     }
    5574             : 
    5575        3347 :     CPLDebug("GDAL",
    5576             :              "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
    5577             :              nSwathCols, nSwathLines, static_cast<int>(bInterleave));
    5578             : 
    5579             :     // Advise the source raster that we are going to read it completely
    5580             :     // Note: this might already have been done by GDALCreateCopy() in the
    5581             :     // likely case this function is indirectly called by it
    5582        3347 :     poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
    5583        3347 :                         nullptr, nullptr);
    5584             : 
    5585             :     /* ==================================================================== */
    5586             :     /*      Band oriented (uninterleaved) case.                             */
    5587             :     /* ==================================================================== */
    5588        3347 :     CPLErr eErr = CE_None;
    5589             :     const bool bCheckHoles =
    5590        3347 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    5591             : 
    5592        3347 :     if (!bInterleave)
    5593             :     {
    5594             :         GDALRasterIOExtraArg sExtraArg;
    5595        2764 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5596        2764 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    5597             : 
    5598        8292 :         const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
    5599        2764 :                                      DIV_ROUND_UP(nYSize, nSwathLines) *
    5600        2764 :                                      DIV_ROUND_UP(nXSize, nSwathCols);
    5601        2764 :         GIntBig nBlocksDone = 0;
    5602             : 
    5603        7971 :         for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
    5604             :         {
    5605        5207 :             int nBand = iBand + 1;
    5606             : 
    5607       10677 :             for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5608             :             {
    5609        5470 :                 int nThisLines = nSwathLines;
    5610             : 
    5611        5470 :                 if (iY + nThisLines > nYSize)
    5612         368 :                     nThisLines = nYSize - iY;
    5613             : 
    5614       10940 :                 for (int iX = 0; iX < nXSize && eErr == CE_None;
    5615        5470 :                      iX += nSwathCols)
    5616             :                 {
    5617        5470 :                     int nThisCols = nSwathCols;
    5618             : 
    5619        5470 :                     if (iX + nThisCols > nXSize)
    5620           0 :                         nThisCols = nXSize - iX;
    5621             : 
    5622        5470 :                     int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5623        5470 :                     if (bCheckHoles)
    5624             :                     {
    5625             :                         nStatus = poSrcDS->GetRasterBand(nBand)
    5626        3758 :                                       ->GetDataCoverageStatus(
    5627             :                                           iX, iY, nThisCols, nThisLines,
    5628             :                                           GDAL_DATA_COVERAGE_STATUS_DATA);
    5629             :                     }
    5630        5470 :                     if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5631             :                     {
    5632        5466 :                         sExtraArg.pfnProgress = GDALScaledProgress;
    5633       10932 :                         sExtraArg.pProgressData = GDALCreateScaledProgress(
    5634        5466 :                             nBlocksDone / static_cast<double>(nTotalBlocks),
    5635        5466 :                             (nBlocksDone + 0.5) /
    5636        5466 :                                 static_cast<double>(nTotalBlocks),
    5637             :                             pfnProgress, pProgressData);
    5638        5466 :                         if (sExtraArg.pProgressData == nullptr)
    5639        1682 :                             sExtraArg.pfnProgress = nullptr;
    5640             : 
    5641        5466 :                         eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    5642             :                                                  nThisLines, pSwathBuf,
    5643             :                                                  nThisCols, nThisLines, eDT, 1,
    5644             :                                                  &nBand, 0, 0, 0, &sExtraArg);
    5645             : 
    5646        5466 :                         GDALDestroyScaledProgress(sExtraArg.pProgressData);
    5647             : 
    5648        5466 :                         if (eErr == CE_None)
    5649        5459 :                             eErr = poDstDS->RasterIO(
    5650             :                                 GF_Write, iX, iY, nThisCols, nThisLines,
    5651             :                                 pSwathBuf, nThisCols, nThisLines, eDT, 1,
    5652             :                                 &nBand, 0, 0, 0, nullptr);
    5653             :                     }
    5654             : 
    5655        5470 :                     nBlocksDone++;
    5656       10898 :                     if (eErr == CE_None &&
    5657        5428 :                         !pfnProgress(nBlocksDone /
    5658        5428 :                                          static_cast<double>(nTotalBlocks),
    5659             :                                      nullptr, pProgressData))
    5660             :                     {
    5661           2 :                         eErr = CE_Failure;
    5662           2 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    5663             :                                  "User terminated CreateCopy()");
    5664             :                     }
    5665             :                 }
    5666             :             }
    5667             :         }
    5668             :     }
    5669             : 
    5670             :     /* ==================================================================== */
    5671             :     /*      Pixel interleaved case.                                         */
    5672             :     /* ==================================================================== */
    5673             :     else /* if( bInterleave ) */
    5674             :     {
    5675             :         GDALRasterIOExtraArg sExtraArg;
    5676         583 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5677         583 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    5678             : 
    5679         583 :         const GIntBig nTotalBlocks =
    5680         583 :             static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
    5681         583 :             DIV_ROUND_UP(nXSize, nSwathCols);
    5682         583 :         GIntBig nBlocksDone = 0;
    5683             : 
    5684        1388 :         for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5685             :         {
    5686         805 :             int nThisLines = nSwathLines;
    5687             : 
    5688         805 :             if (iY + nThisLines > nYSize)
    5689         198 :                 nThisLines = nYSize - iY;
    5690             : 
    5691        1615 :             for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    5692             :             {
    5693         810 :                 int nThisCols = nSwathCols;
    5694             : 
    5695         810 :                 if (iX + nThisCols > nXSize)
    5696           3 :                     nThisCols = nXSize - iX;
    5697             : 
    5698         810 :                 int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5699         810 :                 if (bCheckHoles)
    5700             :                 {
    5701         551 :                     nStatus = 0;
    5702         604 :                     for (int iBand = 0; iBand < nBandCount; iBand++)
    5703             :                     {
    5704         585 :                         nStatus |= poSrcDS->GetRasterBand(iBand + 1)
    5705         585 :                                        ->GetDataCoverageStatus(
    5706             :                                            iX, iY, nThisCols, nThisLines,
    5707             :                                            GDAL_DATA_COVERAGE_STATUS_DATA);
    5708         585 :                         if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5709         532 :                             break;
    5710             :                     }
    5711             :                 }
    5712         810 :                 if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5713             :                 {
    5714         791 :                     sExtraArg.pfnProgress = GDALScaledProgress;
    5715        1582 :                     sExtraArg.pProgressData = GDALCreateScaledProgress(
    5716         791 :                         nBlocksDone / static_cast<double>(nTotalBlocks),
    5717         791 :                         (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
    5718             :                         pfnProgress, pProgressData);
    5719         791 :                     if (sExtraArg.pProgressData == nullptr)
    5720         375 :                         sExtraArg.pfnProgress = nullptr;
    5721             : 
    5722         791 :                     eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    5723             :                                              nThisLines, pSwathBuf, nThisCols,
    5724             :                                              nThisLines, eDT, nBandCount,
    5725             :                                              nullptr, 0, 0, 0, &sExtraArg);
    5726             : 
    5727         791 :                     GDALDestroyScaledProgress(sExtraArg.pProgressData);
    5728             : 
    5729         791 :                     if (eErr == CE_None)
    5730         790 :                         eErr = poDstDS->RasterIO(
    5731             :                             GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
    5732             :                             nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
    5733             :                             0, 0, nullptr);
    5734             :                 }
    5735             : 
    5736         810 :                 nBlocksDone++;
    5737        1615 :                 if (eErr == CE_None &&
    5738         805 :                     !pfnProgress(nBlocksDone /
    5739         805 :                                      static_cast<double>(nTotalBlocks),
    5740             :                                  nullptr, pProgressData))
    5741             :                 {
    5742           1 :                     eErr = CE_Failure;
    5743           1 :                     CPLError(CE_Failure, CPLE_UserInterrupt,
    5744             :                              "User terminated CreateCopy()");
    5745             :                 }
    5746             :             }
    5747             :         }
    5748             :     }
    5749             : 
    5750             :     /* -------------------------------------------------------------------- */
    5751             :     /*      Cleanup                                                         */
    5752             :     /* -------------------------------------------------------------------- */
    5753        3347 :     CPLFree(pSwathBuf);
    5754             : 
    5755        3347 :     return eErr;
    5756             : }
    5757             : 
    5758             : /************************************************************************/
    5759             : /*                   GDALRasterBandCopyWholeRaster()                    */
    5760             : /************************************************************************/
    5761             : 
    5762             : /**
    5763             :  * \brief Copy a whole raster band
    5764             :  *
    5765             :  * This function copies the complete raster contents of one band to
    5766             :  * another similarly configured band.  The source and destination
    5767             :  * bands must have the same width and height.  The bands do not have
    5768             :  * to have the same data type.
    5769             :  *
    5770             :  * It implements efficient copying, in particular "chunking" the copy in
    5771             :  * substantial blocks.
    5772             :  *
    5773             :  * Currently the only papszOptions value supported are :
    5774             :  * <ul>
    5775             :  * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
    5776             :  * achieve best compression.</li>
    5777             :  * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
    5778             :  * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL &gt;= 2.2)</li>
    5779             :  * </ul>
    5780             :  *
    5781             :  * @param hSrcBand the source band
    5782             :  * @param hDstBand the destination band
    5783             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    5784             :  * @param pfnProgress progress reporting function.
    5785             :  * @param pProgressData callback data for progress function.
    5786             :  *
    5787             :  * @return CE_None on success, or CE_Failure on failure.
    5788             :  */
    5789             : 
    5790          29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
    5791             :     GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
    5792             :     const char *const *const papszOptions, GDALProgressFunc pfnProgress,
    5793             :     void *pProgressData)
    5794             : 
    5795             : {
    5796          29 :     VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    5797          29 :     VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    5798             : 
    5799          29 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    5800          29 :     GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
    5801          29 :     CPLErr eErr = CE_None;
    5802             : 
    5803          29 :     if (pfnProgress == nullptr)
    5804           2 :         pfnProgress = GDALDummyProgress;
    5805             : 
    5806             :     /* -------------------------------------------------------------------- */
    5807             :     /*      Confirm the datasets match in size and band counts.             */
    5808             :     /* -------------------------------------------------------------------- */
    5809          29 :     int nXSize = poSrcBand->GetXSize();
    5810          29 :     int nYSize = poSrcBand->GetYSize();
    5811             : 
    5812          29 :     if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
    5813             :     {
    5814           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    5815             :                  "Input and output band sizes do not\n"
    5816             :                  "match in GDALRasterBandCopyWholeRaster()");
    5817           0 :         return CE_Failure;
    5818             :     }
    5819             : 
    5820             :     /* -------------------------------------------------------------------- */
    5821             :     /*      Report preliminary (0) progress.                                */
    5822             :     /* -------------------------------------------------------------------- */
    5823          29 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    5824             :     {
    5825           0 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    5826             :                  "User terminated CreateCopy()");
    5827           0 :         return CE_Failure;
    5828             :     }
    5829             : 
    5830          29 :     GDALDataType eDT = poDstBand->GetRasterDataType();
    5831             : 
    5832             :     // If the destination is compressed, we must try to write blocks just once,
    5833             :     // to save disk space (GTiff case for example), and to avoid data loss
    5834             :     // (JPEG compression for example).
    5835          29 :     bool bDstIsCompressed = false;
    5836             :     const char *pszDstCompressed =
    5837          29 :         CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
    5838          29 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5839          26 :         bDstIsCompressed = true;
    5840             : 
    5841             :     /* -------------------------------------------------------------------- */
    5842             :     /*      What will our swath size be?                                    */
    5843             :     /* -------------------------------------------------------------------- */
    5844             : 
    5845          29 :     int nSwathCols = 0;
    5846          29 :     int nSwathLines = 0;
    5847          29 :     GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
    5848             :                                     FALSE, &nSwathCols, &nSwathLines);
    5849             : 
    5850          29 :     const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5851             : 
    5852          29 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    5853          29 :     if (pSwathBuf == nullptr)
    5854             :     {
    5855           0 :         return CE_Failure;
    5856             :     }
    5857             : 
    5858          29 :     CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
    5859             :              nSwathCols, nSwathLines);
    5860             : 
    5861             :     const bool bCheckHoles =
    5862          29 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    5863             : 
    5864             :     // Advise the source raster that we are going to read it completely
    5865          29 :     poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
    5866             : 
    5867             :     /* ==================================================================== */
    5868             :     /*      Band oriented (uninterleaved) case.                             */
    5869             :     /* ==================================================================== */
    5870             : 
    5871          72 :     for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5872             :     {
    5873          43 :         int nThisLines = nSwathLines;
    5874             : 
    5875          43 :         if (iY + nThisLines > nYSize)
    5876           8 :             nThisLines = nYSize - iY;
    5877             : 
    5878          86 :         for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    5879             :         {
    5880          43 :             int nThisCols = nSwathCols;
    5881             : 
    5882          43 :             if (iX + nThisCols > nXSize)
    5883           0 :                 nThisCols = nXSize - iX;
    5884             : 
    5885          43 :             int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5886          43 :             if (bCheckHoles)
    5887             :             {
    5888           0 :                 nStatus = poSrcBand->GetDataCoverageStatus(
    5889             :                     iX, iY, nThisCols, nThisLines,
    5890             :                     GDAL_DATA_COVERAGE_STATUS_DATA);
    5891             :             }
    5892          43 :             if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5893             :             {
    5894          43 :                 eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
    5895             :                                            nThisLines, pSwathBuf, nThisCols,
    5896             :                                            nThisLines, eDT, 0, 0, nullptr);
    5897             : 
    5898          43 :                 if (eErr == CE_None)
    5899          43 :                     eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
    5900             :                                                nThisLines, pSwathBuf, nThisCols,
    5901             :                                                nThisLines, eDT, 0, 0, nullptr);
    5902             :             }
    5903             : 
    5904          86 :             if (eErr == CE_None && !pfnProgress(double(iY + nThisLines) /
    5905          43 :                                                     static_cast<double>(nYSize),
    5906             :                                                 nullptr, pProgressData))
    5907             :             {
    5908           0 :                 eErr = CE_Failure;
    5909           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt,
    5910             :                          "User terminated CreateCopy()");
    5911             :             }
    5912             :         }
    5913             :     }
    5914             : 
    5915             :     /* -------------------------------------------------------------------- */
    5916             :     /*      Cleanup                                                         */
    5917             :     /* -------------------------------------------------------------------- */
    5918          29 :     CPLFree(pSwathBuf);
    5919             : 
    5920          29 :     return eErr;
    5921             : }
    5922             : 
    5923             : /************************************************************************/
    5924             : /*                     GDALCopyRasterIOExtraArg ()                      */
    5925             : /************************************************************************/
    5926             : 
    5927      533484 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
    5928             :                               const GDALRasterIOExtraArg *psSrcArg)
    5929             : {
    5930      533484 :     INIT_RASTERIO_EXTRA_ARG(*psDestArg);
    5931      533484 :     if (psSrcArg)
    5932             :     {
    5933      533484 :         psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
    5934      533484 :         psDestArg->pfnProgress = psSrcArg->pfnProgress;
    5935      533484 :         psDestArg->pProgressData = psSrcArg->pProgressData;
    5936      533484 :         psDestArg->bFloatingPointWindowValidity =
    5937      533484 :             psSrcArg->bFloatingPointWindowValidity;
    5938      533484 :         if (psSrcArg->bFloatingPointWindowValidity)
    5939             :         {
    5940      210512 :             psDestArg->dfXOff = psSrcArg->dfXOff;
    5941      210512 :             psDestArg->dfYOff = psSrcArg->dfYOff;
    5942      210512 :             psDestArg->dfXSize = psSrcArg->dfXSize;
    5943      210512 :             psDestArg->dfYSize = psSrcArg->dfYSize;
    5944             :         }
    5945      533484 :         if (psSrcArg->nVersion >= 2)
    5946             :         {
    5947      533484 :             psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
    5948             :         }
    5949      533484 :         if (psSrcArg->nVersion >= 3)
    5950             :         {
    5951      533484 :             psDestArg->bOperateInBufType = psSrcArg->bOperateInBufType;
    5952             :         }
    5953             :     }
    5954      533484 : }
    5955             : 
    5956             : /************************************************************************/
    5957             : /*                           HasOnlyNoData()                            */
    5958             : /************************************************************************/
    5959             : 
    5960    51285976 : template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
    5961             : {
    5962    51285976 :     return value == noDataValue;
    5963             : }
    5964             : 
    5965        5509 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
    5966             : {
    5967             :     using std::isnan;
    5968        5509 :     return isnan(noDataValue) ? isnan(value) : value == noDataValue;
    5969             : }
    5970             : 
    5971      251221 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
    5972             : {
    5973      251221 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5974             : }
    5975             : 
    5976      264257 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
    5977             : {
    5978      264257 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5979             : }
    5980             : 
    5981             : template <class T>
    5982       12025 : static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
    5983             :                            size_t nHeight, size_t nLineStride,
    5984             :                            size_t nComponents)
    5985             : {
    5986             :     // Fast test: check the 4 corners and the middle pixel.
    5987       23298 :     for (size_t iBand = 0; iBand < nComponents; iBand++)
    5988             :     {
    5989       24097 :         if (!(IsEqualToNoData(pBuffer[iBand], noDataValue) &&
    5990       11881 :               IsEqualToNoData(pBuffer[(nWidth - 1) * nComponents + iBand],
    5991       11751 :                               noDataValue) &&
    5992       11751 :               IsEqualToNoData(
    5993       11751 :                   pBuffer[((nHeight - 1) / 2 * nLineStride + (nWidth - 1) / 2) *
    5994       11751 :                               nComponents +
    5995             :                           iBand],
    5996       11276 :                   noDataValue) &&
    5997       11276 :               IsEqualToNoData(
    5998       11276 :                   pBuffer[(nHeight - 1) * nLineStride * nComponents + iBand],
    5999             :                   noDataValue) &&
    6000       11276 :               IsEqualToNoData(
    6001       11276 :                   pBuffer[((nHeight - 1) * nLineStride + nWidth - 1) *
    6002       11276 :                               nComponents +
    6003             :                           iBand],
    6004             :                   noDataValue)))
    6005             :         {
    6006         943 :             return false;
    6007             :         }
    6008             :     }
    6009             : 
    6010             :     // Test all pixels.
    6011       52954 :     for (size_t iY = 0; iY < nHeight; iY++)
    6012             :     {
    6013       41993 :         const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
    6014    51790448 :         for (size_t iX = 0; iX < nWidth * nComponents; iX++)
    6015             :         {
    6016    51748615 :             if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
    6017             :             {
    6018         121 :                 return false;
    6019             :             }
    6020             :         }
    6021             :     }
    6022       10961 :     return true;
    6023             : }
    6024             : 
    6025             : /************************************************************************/
    6026             : /*                      GDALBufferHasOnlyNoData()                       */
    6027             : /************************************************************************/
    6028             : 
    6029       43912 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
    6030             :                              size_t nWidth, size_t nHeight, size_t nLineStride,
    6031             :                              size_t nComponents, int nBitsPerSample,
    6032             :                              GDALBufferSampleFormat nSampleFormat)
    6033             : {
    6034             :     // In the case where the nodata is 0, we can compare several bytes at
    6035             :     // once. Select the largest natural integer type for the architecture.
    6036       43912 :     if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    6037             :         // Do not use this optimized code path for floating point numbers,
    6038             :         // as it can't detect negative zero.
    6039             :         nSampleFormat != GSF_FLOATING_POINT)
    6040             :     {
    6041       27267 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    6042       27267 :         const size_t nSize =
    6043       27267 :             static_cast<size_t>((static_cast<uint64_t>(nWidth) * nHeight *
    6044       27267 :                                      nComponents * nBitsPerSample +
    6045             :                                  7) /
    6046             :                                 8);
    6047             : #ifdef HAVE_SSE2
    6048       27267 :         size_t n = nSize;
    6049             :         // Align to 16 bytes
    6050       27330 :         while ((reinterpret_cast<uintptr_t>(pabyBuffer) & 15) != 0 && n > 0)
    6051             :         {
    6052          73 :             --n;
    6053          73 :             if (*pabyBuffer)
    6054          10 :                 return false;
    6055          63 :             pabyBuffer++;
    6056             :         }
    6057             : 
    6058       27257 :         const auto zero = _mm_setzero_si128();
    6059       27257 :         constexpr int UNROLLING = 4;
    6060     2223240 :         while (n >= UNROLLING * sizeof(zero))
    6061             :         {
    6062     2207980 :             const auto v0 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6063             :                 pabyBuffer + 0 * sizeof(zero)));
    6064     2207980 :             const auto v1 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6065     2207980 :                 pabyBuffer + 1 * sizeof(zero)));
    6066     2207980 :             const auto v2 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6067     2207980 :                 pabyBuffer + 2 * sizeof(zero)));
    6068     2207980 :             const auto v3 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6069     2207980 :                 pabyBuffer + 3 * sizeof(zero)));
    6070             :             const auto v =
    6071     6623950 :                 _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    6072             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    6073             :             if (!_mm_test_all_zeros(v, v))
    6074             : #else
    6075     4415970 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    6076             : #endif
    6077             :             {
    6078       12002 :                 return false;
    6079             :             }
    6080     2195980 :             pabyBuffer += UNROLLING * sizeof(zero);
    6081     2195980 :             n -= UNROLLING * sizeof(zero);
    6082             :         }
    6083             : 
    6084      233676 :         while (n > 0)
    6085             :         {
    6086      218525 :             --n;
    6087      218525 :             if (*pabyBuffer)
    6088         104 :                 return false;
    6089      218421 :             pabyBuffer++;
    6090             :         }
    6091             : #else
    6092             : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
    6093             :         // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
    6094             :         typedef std::uint64_t WordType;
    6095             : #else
    6096             :         typedef std::uint32_t WordType;
    6097             : #endif
    6098             : 
    6099             :         const size_t nInitialIters =
    6100             :             std::min(sizeof(WordType) -
    6101             :                          static_cast<size_t>(
    6102             :                              reinterpret_cast<std::uintptr_t>(pabyBuffer) %
    6103             :                              sizeof(WordType)),
    6104             :                      nSize);
    6105             :         size_t i = 0;
    6106             :         for (; i < nInitialIters; i++)
    6107             :         {
    6108             :             if (pabyBuffer[i])
    6109             :                 return false;
    6110             :         }
    6111             :         for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
    6112             :         {
    6113             :             if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
    6114             :                 return false;
    6115             :         }
    6116             :         for (; i < nSize; i++)
    6117             :         {
    6118             :             if (pabyBuffer[i])
    6119             :                 return false;
    6120             :         }
    6121             : #endif
    6122       15151 :         return true;
    6123             :     }
    6124             : 
    6125             : #ifdef HAVE_SSE2
    6126       16645 :     else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    6127         708 :              nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    6128             :     {
    6129         708 :         const auto signMask = _mm_set1_epi32(0x7FFFFFFF);
    6130         708 :         const auto zero = _mm_setzero_si128();
    6131         708 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    6132         708 :         const size_t n = nWidth * nHeight * nComponents;
    6133             : 
    6134         708 :         size_t i = 0;
    6135         708 :         constexpr int UNROLLING = 4;
    6136         708 :         constexpr size_t VALUES_PER_ITER =
    6137             :             UNROLLING * sizeof(zero) / sizeof(float);
    6138       24983 :         for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
    6139             :         {
    6140       24934 :             const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6141             :                 pabyBuffer + 0 * sizeof(zero)));
    6142       24934 :             const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6143       24934 :                 pabyBuffer + 1 * sizeof(zero)));
    6144       24934 :             const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6145       24934 :                 pabyBuffer + 2 * sizeof(zero)));
    6146       24934 :             const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6147       24934 :                 pabyBuffer + 3 * sizeof(zero)));
    6148       74802 :             auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    6149             :             // Clear the sign bit (makes -0.0 become +0.0)
    6150       24934 :             v = _mm_and_si128(v, signMask);
    6151             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    6152             :             if (!_mm_test_all_zeros(v, v))
    6153             : #else
    6154       49868 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    6155             : #endif
    6156             :             {
    6157         659 :                 return false;
    6158             :             }
    6159       24275 :             pabyBuffer += UNROLLING * sizeof(zero);
    6160             :         }
    6161             : 
    6162         304 :         for (; i < n; i++)
    6163             :         {
    6164             :             uint32_t bits;
    6165         272 :             memcpy(&bits, pabyBuffer, sizeof(bits));
    6166         272 :             pabyBuffer += sizeof(bits);
    6167         272 :             if ((bits & 0x7FFFFFFF) != 0)
    6168          17 :                 return false;
    6169             :         }
    6170             : 
    6171          32 :         return true;
    6172             :     }
    6173             : 
    6174       15937 :     else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    6175        3905 :              nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    6176             :     {
    6177        3905 :         const auto signMask = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
    6178        3905 :         const auto zero = _mm_setzero_si128();
    6179        3905 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    6180        3905 :         const size_t n = nWidth * nHeight * nComponents;
    6181             : 
    6182        3905 :         size_t i = 0;
    6183        3905 :         constexpr int UNROLLING = 4;
    6184        3905 :         constexpr size_t VALUES_PER_ITER =
    6185             :             UNROLLING * sizeof(zero) / sizeof(double);
    6186     1664570 :         for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
    6187             :         {
    6188     1660950 :             const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6189             :                 pabyBuffer + 0 * sizeof(zero)));
    6190     1660950 :             const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6191     1660950 :                 pabyBuffer + 1 * sizeof(zero)));
    6192     1660950 :             const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6193     1660950 :                 pabyBuffer + 2 * sizeof(zero)));
    6194     1660950 :             const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6195     1660950 :                 pabyBuffer + 3 * sizeof(zero)));
    6196     4982850 :             auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    6197             :             // Clear the sign bit (makes -0.0 become +0.0)
    6198     1660950 :             v = _mm_and_si128(v, signMask);
    6199             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    6200             :             if (!_mm_test_all_zeros(v, v))
    6201             : #else
    6202     3321900 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    6203             : #endif
    6204             :             {
    6205         289 :                 return false;
    6206             :             }
    6207     1660660 :             pabyBuffer += UNROLLING * sizeof(zero);
    6208             :         }
    6209             : 
    6210        3643 :         for (; i < n; i++)
    6211             :         {
    6212             :             uint64_t bits;
    6213          34 :             memcpy(&bits, pabyBuffer, sizeof(bits));
    6214          34 :             pabyBuffer += sizeof(bits);
    6215          34 :             if ((bits & 0x7FFFFFFFFFFFFFFFULL) != 0)
    6216           7 :                 return false;
    6217             :         }
    6218             : 
    6219        3609 :         return true;
    6220             :     }
    6221             : #endif
    6222             : 
    6223       12032 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
    6224             :     {
    6225       22426 :         return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
    6226       11213 :                HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
    6227       11213 :                               static_cast<uint8_t>(dfNoDataValue), nWidth,
    6228       11213 :                               nHeight, nLineStride, nComponents);
    6229             :     }
    6230         819 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
    6231             :     {
    6232             :         // Use unsigned implementation by converting the nodatavalue to
    6233             :         // unsigned
    6234         119 :         return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
    6235          59 :                HasOnlyNoDataT(
    6236             :                    static_cast<const uint8_t *>(pBuffer),
    6237          59 :                    static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
    6238          60 :                    nWidth, nHeight, nLineStride, nComponents);
    6239             :     }
    6240         759 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
    6241             :     {
    6242          23 :         return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
    6243          11 :                HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
    6244          11 :                               static_cast<uint16_t>(dfNoDataValue), nWidth,
    6245          12 :                               nHeight, nLineStride, nComponents);
    6246             :     }
    6247         747 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
    6248             :     {
    6249             :         // Use unsigned implementation by converting the nodatavalue to
    6250             :         // unsigned
    6251         111 :         return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
    6252          55 :                HasOnlyNoDataT(
    6253             :                    static_cast<const uint16_t *>(pBuffer),
    6254          55 :                    static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
    6255          56 :                    nWidth, nHeight, nLineStride, nComponents);
    6256             :     }
    6257         691 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
    6258             :     {
    6259         129 :         return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
    6260          64 :                HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
    6261             :                               static_cast<uint32_t>(dfNoDataValue), nWidth,
    6262          65 :                               nHeight, nLineStride, nComponents);
    6263             :     }
    6264         626 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
    6265             :     {
    6266             :         // Use unsigned implementation by converting the nodatavalue to
    6267             :         // unsigned
    6268          23 :         return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
    6269          11 :                HasOnlyNoDataT(
    6270             :                    static_cast<const uint32_t *>(pBuffer),
    6271          11 :                    static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
    6272          12 :                    nWidth, nHeight, nLineStride, nComponents);
    6273             :     }
    6274         614 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
    6275             :     {
    6276         112 :         return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
    6277          56 :                HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
    6278             :                               static_cast<uint64_t>(dfNoDataValue), nWidth,
    6279          56 :                               nHeight, nLineStride, nComponents);
    6280             :     }
    6281         558 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
    6282             :     {
    6283             :         // Use unsigned implementation by converting the nodatavalue to
    6284             :         // unsigned
    6285           0 :         return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
    6286           0 :                HasOnlyNoDataT(
    6287             :                    static_cast<const uint64_t *>(pBuffer),
    6288           0 :                    static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
    6289           0 :                    nWidth, nHeight, nLineStride, nComponents);
    6290             :     }
    6291         558 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
    6292             :     {
    6293         106 :         return (std::isnan(dfNoDataValue) ||
    6294         211 :                 GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
    6295         105 :                HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
    6296             :                               static_cast<GFloat16>(dfNoDataValue), nWidth,
    6297         106 :                               nHeight, nLineStride, nComponents);
    6298             :     }
    6299         452 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    6300             :     {
    6301         268 :         return (std::isnan(dfNoDataValue) ||
    6302         535 :                 GDALIsValueInRange<float>(dfNoDataValue)) &&
    6303         267 :                HasOnlyNoDataT(static_cast<const float *>(pBuffer),
    6304             :                               static_cast<float>(dfNoDataValue), nWidth,
    6305         268 :                               nHeight, nLineStride, nComponents);
    6306             :     }
    6307         184 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    6308             :     {
    6309         184 :         return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
    6310             :                               dfNoDataValue, nWidth, nHeight, nLineStride,
    6311         184 :                               nComponents);
    6312             :     }
    6313           0 :     return false;
    6314             : }
    6315             : 
    6316             : #ifdef HAVE_SSE2
    6317             : 
    6318             : /************************************************************************/
    6319             : /*                       GDALDeinterleave3Byte()                        */
    6320             : /************************************************************************/
    6321             : 
    6322             : #if defined(__GNUC__) && !defined(__clang__)
    6323             : __attribute__((optimize("no-tree-vectorize")))
    6324             : #endif
    6325      380714 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    6326             :                                   GByte *CPL_RESTRICT pabyDest0,
    6327             :                                   GByte *CPL_RESTRICT pabyDest1,
    6328             :                                   GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    6329             : #ifdef USE_NEON_OPTIMIZATIONS
    6330             : {
    6331             :     return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6332             :                                        nIters);
    6333             : }
    6334             : #else
    6335             : {
    6336             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    6337      380714 :     if (CPLHaveRuntimeSSSE3())
    6338             :     {
    6339      380712 :         return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    6340      380712 :                                            pabyDest2, nIters);
    6341             :     }
    6342             : #endif
    6343             : 
    6344           2 :     size_t i = 0;
    6345           2 :     if (((reinterpret_cast<uintptr_t>(pabySrc) |
    6346           2 :           reinterpret_cast<uintptr_t>(pabyDest0) |
    6347           2 :           reinterpret_cast<uintptr_t>(pabyDest1) |
    6348           2 :           reinterpret_cast<uintptr_t>(pabyDest2)) %
    6349             :          sizeof(unsigned int)) == 0)
    6350             :     {
    6351             :         // Slightly better than GCC autovectorizer
    6352          17 :         for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
    6353             :         {
    6354          15 :             unsigned int word0 =
    6355          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
    6356          15 :             unsigned int word1 =
    6357          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
    6358          15 :             unsigned int word2 =
    6359          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
    6360          15 :             reinterpret_cast<unsigned int *>(pabyDest0)[j] =
    6361          15 :                 (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
    6362          15 :                 ((word2 >> 8) << 24);
    6363          15 :             reinterpret_cast<unsigned int *>(pabyDest1)[j] =
    6364          15 :                 ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
    6365          15 :                 (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
    6366          15 :             pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
    6367          15 :             pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
    6368          15 :             pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
    6369          15 :             pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
    6370             :         }
    6371             :     }
    6372             : #if defined(__clang__)
    6373             : #pragma clang loop vectorize(disable)
    6374             : #endif
    6375           3 :     for (; i < nIters; ++i)
    6376             :     {
    6377           1 :         pabyDest0[i] = pabySrc[3 * i + 0];
    6378           1 :         pabyDest1[i] = pabySrc[3 * i + 1];
    6379           1 :         pabyDest2[i] = pabySrc[3 * i + 2];
    6380             :     }
    6381             : }
    6382             : #endif
    6383             : 
    6384             : /************************************************************************/
    6385             : /*                       GDALDeinterleave4Byte()                        */
    6386             : /************************************************************************/
    6387             : 
    6388             : #if !defined(__GNUC__) || defined(__clang__)
    6389             : 
    6390             : /************************************************************************/
    6391             : /*                            deinterleave()                            */
    6392             : /************************************************************************/
    6393             : 
    6394             : template <bool SHIFT, bool MASK>
    6395             : inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
    6396             :                             __m128i &xmm2_ori, __m128i &xmm3_ori)
    6397             : {
    6398             :     // Set higher 24bit of each int32 packed word to 0
    6399             :     if (SHIFT)
    6400             :     {
    6401             :         xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
    6402             :         xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
    6403             :         xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
    6404             :         xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
    6405             :     }
    6406             :     __m128i xmm0;
    6407             :     __m128i xmm1;
    6408             :     __m128i xmm2;
    6409             :     __m128i xmm3;
    6410             :     if (MASK)
    6411             :     {
    6412             :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    6413             :         xmm0 = _mm_and_si128(xmm0_ori, xmm_mask);
    6414             :         xmm1 = _mm_and_si128(xmm1_ori, xmm_mask);
    6415             :         xmm2 = _mm_and_si128(xmm2_ori, xmm_mask);
    6416             :         xmm3 = _mm_and_si128(xmm3_ori, xmm_mask);
    6417             :     }
    6418             :     else
    6419             :     {
    6420             :         xmm0 = xmm0_ori;
    6421             :         xmm1 = xmm1_ori;
    6422             :         xmm2 = xmm2_ori;
    6423             :         xmm3 = xmm3_ori;
    6424             :     }
    6425             :     // Pack int32 to int16
    6426             :     xmm0 = _mm_packs_epi32(xmm0, xmm1);
    6427             :     xmm2 = _mm_packs_epi32(xmm2, xmm3);
    6428             :     // Pack int16 to uint8
    6429             :     xmm0 = _mm_packus_epi16(xmm0, xmm2);
    6430             :     return xmm0;
    6431             : }
    6432             : 
    6433             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    6434             :                                   GByte *CPL_RESTRICT pabyDest0,
    6435             :                                   GByte *CPL_RESTRICT pabyDest1,
    6436             :                                   GByte *CPL_RESTRICT pabyDest2,
    6437             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6438             : #ifdef USE_NEON_OPTIMIZATIONS
    6439             : {
    6440             :     return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6441             :                                        pabyDest3, nIters);
    6442             : }
    6443             : #else
    6444             : {
    6445             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    6446             :     if (CPLHaveRuntimeSSSE3())
    6447             :     {
    6448             :         return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    6449             :                                            pabyDest2, pabyDest3, nIters);
    6450             :     }
    6451             : #endif
    6452             : 
    6453             :     // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
    6454             :     // do something slightly better.
    6455             :     size_t i = 0;
    6456             :     for (; i + 15 < nIters; i += 16)
    6457             :     {
    6458             :         __m128i xmm0_ori = _mm_loadu_si128(
    6459             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
    6460             :         __m128i xmm1_ori = _mm_loadu_si128(
    6461             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
    6462             :         __m128i xmm2_ori = _mm_loadu_si128(
    6463             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
    6464             :         __m128i xmm3_ori = _mm_loadu_si128(
    6465             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
    6466             : 
    6467             :         _mm_storeu_si128(
    6468             :             reinterpret_cast<__m128i *>(pabyDest0 + i),
    6469             :             deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6470             :         _mm_storeu_si128(
    6471             :             reinterpret_cast<__m128i *>(pabyDest1 + i),
    6472             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6473             :         _mm_storeu_si128(
    6474             :             reinterpret_cast<__m128i *>(pabyDest2 + i),
    6475             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6476             :         _mm_storeu_si128(
    6477             :             reinterpret_cast<__m128i *>(pabyDest3 + i),
    6478             :             deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6479             :     }
    6480             : 
    6481             : #if defined(__clang__)
    6482             : #pragma clang loop vectorize(disable)
    6483             : #endif
    6484             :     for (; i < nIters; ++i)
    6485             :     {
    6486             :         pabyDest0[i] = pabySrc[4 * i + 0];
    6487             :         pabyDest1[i] = pabySrc[4 * i + 1];
    6488             :         pabyDest2[i] = pabySrc[4 * i + 2];
    6489             :         pabyDest3[i] = pabySrc[4 * i + 3];
    6490             :     }
    6491             : }
    6492             : #endif
    6493             : #else
    6494             : // GCC autovectorizer does an excellent job
    6495       73222 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
    6496             :     const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    6497             :     GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
    6498             :     GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6499             : {
    6500   539910000 :     for (size_t i = 0; i < nIters; ++i)
    6501             :     {
    6502   539837000 :         pabyDest0[i] = pabySrc[4 * i + 0];
    6503   539837000 :         pabyDest1[i] = pabySrc[4 * i + 1];
    6504   539837000 :         pabyDest2[i] = pabySrc[4 * i + 2];
    6505   539837000 :         pabyDest3[i] = pabySrc[4 * i + 3];
    6506             :     }
    6507       73222 : }
    6508             : #endif
    6509             : 
    6510             : #else
    6511             : 
    6512             : /************************************************************************/
    6513             : /*                       GDALDeinterleave3Byte()                        */
    6514             : /************************************************************************/
    6515             : 
    6516             : // TODO: Enabling below could help on non-Intel architectures where GCC knows
    6517             : // how to auto-vectorize
    6518             : // #if defined(__GNUC__)
    6519             : //__attribute__((optimize("tree-vectorize")))
    6520             : // #endif
    6521             : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    6522             :                                   GByte *CPL_RESTRICT pabyDest0,
    6523             :                                   GByte *CPL_RESTRICT pabyDest1,
    6524             :                                   GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    6525             : {
    6526             :     for (size_t i = 0; i < nIters; ++i)
    6527             :     {
    6528             :         pabyDest0[i] = pabySrc[3 * i + 0];
    6529             :         pabyDest1[i] = pabySrc[3 * i + 1];
    6530             :         pabyDest2[i] = pabySrc[3 * i + 2];
    6531             :     }
    6532             : }
    6533             : 
    6534             : /************************************************************************/
    6535             : /*                       GDALDeinterleave4Byte()                        */
    6536             : /************************************************************************/
    6537             : 
    6538             : // TODO: Enabling below could help on non-Intel architectures where gcc knows
    6539             : // how to auto-vectorize
    6540             : // #if defined(__GNUC__)
    6541             : //__attribute__((optimize("tree-vectorize")))
    6542             : // #endif
    6543             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    6544             :                                   GByte *CPL_RESTRICT pabyDest0,
    6545             :                                   GByte *CPL_RESTRICT pabyDest1,
    6546             :                                   GByte *CPL_RESTRICT pabyDest2,
    6547             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6548             : {
    6549             :     for (size_t i = 0; i < nIters; ++i)
    6550             :     {
    6551             :         pabyDest0[i] = pabySrc[4 * i + 0];
    6552             :         pabyDest1[i] = pabySrc[4 * i + 1];
    6553             :         pabyDest2[i] = pabySrc[4 * i + 2];
    6554             :         pabyDest3[i] = pabySrc[4 * i + 3];
    6555             :     }
    6556             : }
    6557             : 
    6558             : #endif
    6559             : 
    6560             : /************************************************************************/
    6561             : /*                          GDALDeinterleave()                          */
    6562             : /************************************************************************/
    6563             : 
    6564             : /*! Copy values from a pixel-interleave buffer to multiple per-component
    6565             :     buffers.
    6566             : 
    6567             :     In pseudo-code
    6568             :     \verbatim
    6569             :     for(size_t i = 0; i < nIters; ++i)
    6570             :         for(int iComp = 0; iComp < nComponents; iComp++ )
    6571             :             ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
    6572             :     \endverbatim
    6573             : 
    6574             :     The implementation is optimized for a few cases, like de-interleaving
    6575             :     of 3 or 4-components Byte buffers.
    6576             : 
    6577             :     \since GDAL 3.6
    6578             :  */
    6579      454286 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
    6580             :                       int nComponents, void **ppDestBuffer,
    6581             :                       GDALDataType eDestDT, size_t nIters)
    6582             : {
    6583      454286 :     if (eSourceDT == eDestDT)
    6584             :     {
    6585      454264 :         if (eSourceDT == GDT_UInt8 || eSourceDT == GDT_Int8)
    6586             :         {
    6587      453943 :             if (nComponents == 3)
    6588             :             {
    6589      380714 :                 const GByte *CPL_RESTRICT pabySrc =
    6590             :                     static_cast<const GByte *>(pSourceBuffer);
    6591      380714 :                 GByte *CPL_RESTRICT pabyDest0 =
    6592             :                     static_cast<GByte *>(ppDestBuffer[0]);
    6593      380714 :                 GByte *CPL_RESTRICT pabyDest1 =
    6594             :                     static_cast<GByte *>(ppDestBuffer[1]);
    6595      380714 :                 GByte *CPL_RESTRICT pabyDest2 =
    6596             :                     static_cast<GByte *>(ppDestBuffer[2]);
    6597      380714 :                 GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6598             :                                       nIters);
    6599      380714 :                 return;
    6600             :             }
    6601       73229 :             else if (nComponents == 4)
    6602             :             {
    6603       73222 :                 const GByte *CPL_RESTRICT pabySrc =
    6604             :                     static_cast<const GByte *>(pSourceBuffer);
    6605       73222 :                 GByte *CPL_RESTRICT pabyDest0 =
    6606             :                     static_cast<GByte *>(ppDestBuffer[0]);
    6607       73222 :                 GByte *CPL_RESTRICT pabyDest1 =
    6608             :                     static_cast<GByte *>(ppDestBuffer[1]);
    6609       73222 :                 GByte *CPL_RESTRICT pabyDest2 =
    6610             :                     static_cast<GByte *>(ppDestBuffer[2]);
    6611       73222 :                 GByte *CPL_RESTRICT pabyDest3 =
    6612             :                     static_cast<GByte *>(ppDestBuffer[3]);
    6613       73222 :                 GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6614             :                                       pabyDest3, nIters);
    6615       73222 :                 return;
    6616           7 :             }
    6617             :         }
    6618             : #if ((defined(__GNUC__) && !defined(__clang__)) ||                             \
    6619             :      defined(__INTEL_CLANG_COMPILER)) &&                                       \
    6620             :     defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
    6621         642 :         else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
    6622         321 :                  CPLHaveRuntimeSSSE3())
    6623             :         {
    6624         321 :             if (nComponents == 3)
    6625             :             {
    6626         126 :                 const GUInt16 *CPL_RESTRICT panSrc =
    6627             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    6628         126 :                 GUInt16 *CPL_RESTRICT panDest0 =
    6629             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    6630         126 :                 GUInt16 *CPL_RESTRICT panDest1 =
    6631             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    6632         126 :                 GUInt16 *CPL_RESTRICT panDest2 =
    6633             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    6634         126 :                 GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
    6635             :                                               panDest2, nIters);
    6636         126 :                 return;
    6637             :             }
    6638             : #if !defined(__INTEL_CLANG_COMPILER)
    6639             :             // ICC autovectorizer doesn't do a good job, at least with icx
    6640             :             // 2022.1.0.20220316
    6641         195 :             else if (nComponents == 4)
    6642             :             {
    6643         195 :                 const GUInt16 *CPL_RESTRICT panSrc =
    6644             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    6645         195 :                 GUInt16 *CPL_RESTRICT panDest0 =
    6646             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    6647         195 :                 GUInt16 *CPL_RESTRICT panDest1 =
    6648             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    6649         195 :                 GUInt16 *CPL_RESTRICT panDest2 =
    6650             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    6651         195 :                 GUInt16 *CPL_RESTRICT panDest3 =
    6652             :                     static_cast<GUInt16 *>(ppDestBuffer[3]);
    6653         195 :                 GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
    6654             :                                               panDest2, panDest3, nIters);
    6655         195 :                 return;
    6656             :             }
    6657             : #endif
    6658             :         }
    6659             : #endif
    6660             :     }
    6661             : 
    6662          29 :     const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
    6663          29 :     const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
    6664         108 :     for (int iComp = 0; iComp < nComponents; iComp++)
    6665             :     {
    6666          79 :         GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
    6667          79 :                             iComp * nSourceDTSize,
    6668             :                         eSourceDT, nComponents * nSourceDTSize,
    6669          79 :                         ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
    6670             :     }
    6671             : }
    6672             : 
    6673             : /************************************************************************/
    6674             : /*                   GDALTranspose2DSingleToSingle()                    */
    6675             : /************************************************************************/
    6676             : /**
    6677             :  * Transpose a 2D array of non-complex values, in a efficient (cache-oblivious) way.
    6678             :  *
    6679             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6680             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6681             :  * @param nSrcWidth Width of pSrc array.
    6682             :  * @param nSrcHeight Height of pSrc array.
    6683             :  */
    6684             : 
    6685             : template <class DST, class SRC>
    6686         160 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
    6687             :                                    DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6688             :                                    size_t nSrcHeight)
    6689             : {
    6690         160 :     constexpr size_t blocksize = 32;
    6691         345 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6692             :     {
    6693         185 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    6694        5016 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6695             :         {
    6696             :             // transpose the block beginning at [i,j]
    6697        4831 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    6698       26185 :             for (size_t k = i; k < max_k; ++k)
    6699             :             {
    6700      669282 :                 for (size_t l = j; l < max_l; ++l)
    6701             :                 {
    6702      647928 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],
    6703      647928 :                                  pDst[k + l * nSrcHeight]);
    6704             :                 }
    6705             :             }
    6706             :         }
    6707             :     }
    6708         160 : }
    6709             : 
    6710             : /************************************************************************/
    6711             : /*                  GDALTranspose2DComplexToComplex()                   */
    6712             : /************************************************************************/
    6713             : /**
    6714             :  * Transpose a 2D array of complex values into an array of complex values,
    6715             :  * in a efficient (cache-oblivious) way.
    6716             :  *
    6717             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6718             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6719             :  * @param nSrcWidth Width of pSrc array.
    6720             :  * @param nSrcHeight Height of pSrc array.
    6721             :  */
    6722             : template <class DST, class SRC>
    6723          25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
    6724             :                                      DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6725             :                                      size_t nSrcHeight)
    6726             : {
    6727          25 :     constexpr size_t blocksize = 32;
    6728          50 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6729             :     {
    6730          25 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    6731          50 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6732             :         {
    6733             :             // transpose the block beginning at [i,j]
    6734          25 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    6735          75 :             for (size_t k = i; k < max_k; ++k)
    6736             :             {
    6737         200 :                 for (size_t l = j; l < max_l; ++l)
    6738             :                 {
    6739         150 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
    6740         150 :                                  pDst[2 * (k + l * nSrcHeight) + 0]);
    6741         150 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
    6742         150 :                                  pDst[2 * (k + l * nSrcHeight) + 1]);
    6743             :                 }
    6744             :             }
    6745             :         }
    6746             :     }
    6747          25 : }
    6748             : 
    6749             : /************************************************************************/
    6750             : /*                   GDALTranspose2DComplexToSingle()                   */
    6751             : /************************************************************************/
    6752             : /**
    6753             :  * Transpose a 2D array of complex values into an array of non-complex values,
    6754             :  * in a efficient (cache-oblivious) way.
    6755             :  *
    6756             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6757             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6758             :  * @param nSrcWidth Width of pSrc array.
    6759             :  * @param nSrcHeight Height of pSrc array.
    6760             :  */
    6761             : template <class DST, class SRC>
    6762          55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
    6763             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6764             :                                     size_t nSrcHeight)
    6765             : {
    6766          55 :     constexpr size_t blocksize = 32;
    6767         110 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6768             :     {
    6769          55 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    6770         110 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6771             :         {
    6772             :             // transpose the block beginning at [i,j]
    6773          55 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    6774         165 :             for (size_t k = i; k < max_k; ++k)
    6775             :             {
    6776         440 :                 for (size_t l = j; l < max_l; ++l)
    6777             :                 {
    6778         330 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
    6779         330 :                                  pDst[k + l * nSrcHeight]);
    6780             :                 }
    6781             :             }
    6782             :         }
    6783             :     }
    6784          55 : }
    6785             : 
    6786             : /************************************************************************/
    6787             : /*                   GDALTranspose2DSingleToComplex()                   */
    6788             : /************************************************************************/
    6789             : /**
    6790             :  * Transpose a 2D array of non-complex values into an array of complex values,
    6791             :  * in a efficient (cache-oblivious) way.
    6792             :  *
    6793             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6794             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6795             :  * @param nSrcWidth Width of pSrc array.
    6796             :  * @param nSrcHeight Height of pSrc array.
    6797             :  */
    6798             : template <class DST, class SRC>
    6799          55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
    6800             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6801             :                                     size_t nSrcHeight)
    6802             : {
    6803          55 :     constexpr size_t blocksize = 32;
    6804         110 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6805             :     {
    6806          55 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    6807         110 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6808             :         {
    6809             :             // transpose the block beginning at [i,j]
    6810          55 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    6811         165 :             for (size_t k = i; k < max_k; ++k)
    6812             :             {
    6813         440 :                 for (size_t l = j; l < max_l; ++l)
    6814             :                 {
    6815         330 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],
    6816         330 :                                  pDst[2 * (k + l * nSrcHeight) + 0]);
    6817         330 :                     pDst[2 * (k + l * nSrcHeight) + 1] = 0;
    6818             :                 }
    6819             :             }
    6820             :         }
    6821             :     }
    6822          55 : }
    6823             : 
    6824             : /************************************************************************/
    6825             : /*                          GDALTranspose2D()                           */
    6826             : /************************************************************************/
    6827             : 
    6828             : template <class DST, bool DST_IS_COMPLEX>
    6829         295 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
    6830             :                             size_t nSrcWidth, size_t nSrcHeight)
    6831             : {
    6832             : #define CALL_GDALTranspose2D_internal(SRC_TYPE)                                \
    6833             :     do                                                                         \
    6834             :     {                                                                          \
    6835             :         if constexpr (DST_IS_COMPLEX)                                          \
    6836             :         {                                                                      \
    6837             :             GDALTranspose2DSingleToComplex(                                    \
    6838             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6839             :                 nSrcHeight);                                                   \
    6840             :         }                                                                      \
    6841             :         else                                                                   \
    6842             :         {                                                                      \
    6843             :             GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
    6844             :                                           pDst, nSrcWidth, nSrcHeight);        \
    6845             :         }                                                                      \
    6846             :     } while (0)
    6847             : 
    6848             : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE)                         \
    6849             :     do                                                                         \
    6850             :     {                                                                          \
    6851             :         if constexpr (DST_IS_COMPLEX)                                          \
    6852             :         {                                                                      \
    6853             :             GDALTranspose2DComplexToComplex(                                   \
    6854             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6855             :                 nSrcHeight);                                                   \
    6856             :         }                                                                      \
    6857             :         else                                                                   \
    6858             :         {                                                                      \
    6859             :             GDALTranspose2DComplexToSingle(                                    \
    6860             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6861             :                 nSrcHeight);                                                   \
    6862             :         }                                                                      \
    6863             :     } while (0)
    6864             : 
    6865             :     // clang-format off
    6866         295 :     switch (eSrcType)
    6867             :     {
    6868          16 :         case GDT_UInt8:     CALL_GDALTranspose2D_internal(uint8_t); break;
    6869          15 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t); break;
    6870          33 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t); break;
    6871          20 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t); break;
    6872          24 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t); break;
    6873          16 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t); break;
    6874          16 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t); break;
    6875          16 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t); break;
    6876          16 :         case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16); break;
    6877          19 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float); break;
    6878          24 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double); break;
    6879          16 :         case GDT_CInt16:   CALL_GDALTranspose2DComplex_internal(int16_t); break;
    6880          16 :         case GDT_CInt32:   CALL_GDALTranspose2DComplex_internal(int32_t); break;
    6881          16 :         case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
    6882          16 :         case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
    6883          16 :         case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
    6884           0 :         case GDT_Unknown:
    6885             :         case GDT_TypeCount:
    6886           0 :             break;
    6887             :     }
    6888             :         // clang-format on
    6889             : 
    6890             : #undef CALL_GDALTranspose2D_internal
    6891             : #undef CALL_GDALTranspose2DComplex_internal
    6892         295 : }
    6893             : 
    6894             : /************************************************************************/
    6895             : /*                        GDALInterleave2Byte()                         */
    6896             : /************************************************************************/
    6897             : 
    6898             : #if defined(HAVE_SSE2) &&                                                      \
    6899             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    6900             : 
    6901             : // ICC autovectorizer doesn't do a good job at generating good SSE code,
    6902             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
    6903             : #if defined(__GNUC__)
    6904             : __attribute__((noinline))
    6905             : #endif
    6906             : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
    6907             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)
    6908             : {
    6909             :     size_t i = 0;
    6910             :     constexpr size_t VALS_PER_ITER = 16;
    6911             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
    6912             :     {
    6913             :         __m128i xmm0 =
    6914             :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
    6915             :         __m128i xmm1 = _mm_loadu_si128(
    6916             :             reinterpret_cast<__m128i const *>(pSrc + i + nIters));
    6917             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
    6918             :                          _mm_unpacklo_epi8(xmm0, xmm1));
    6919             :         _mm_storeu_si128(
    6920             :             reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
    6921             :             _mm_unpackhi_epi8(xmm0, xmm1));
    6922             :     }
    6923             : #if defined(__clang__)
    6924             : #pragma clang loop vectorize(disable)
    6925             : #endif
    6926             :     for (; i < nIters; ++i)
    6927             :     {
    6928             :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];
    6929             :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];
    6930             :     }
    6931             : }
    6932             : 
    6933             : #else
    6934             : 
    6935             : #if defined(__GNUC__) && !defined(__clang__)
    6936             : __attribute__((optimize("tree-vectorize")))
    6937             : #endif
    6938             : #if defined(__GNUC__)
    6939             : __attribute__((noinline))
    6940             : #endif
    6941             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6942             : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
    6943             : #pragma clang diagnostic push
    6944             : #pragma clang diagnostic ignored "-Wpass-failed"
    6945             : #endif
    6946           9 : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
    6947             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)
    6948             : {
    6949             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6950             : #pragma clang loop vectorize(enable)
    6951             : #endif
    6952      355429 :     for (size_t i = 0; i < nIters; ++i)
    6953             :     {
    6954      355420 :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];
    6955      355420 :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];
    6956             :     }
    6957           9 : }
    6958             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6959             : #pragma clang diagnostic pop
    6960             : #endif
    6961             : 
    6962             : #endif
    6963             : 
    6964             : /************************************************************************/
    6965             : /*                        GDALInterleave4Byte()                         */
    6966             : /************************************************************************/
    6967             : 
    6968             : #if defined(HAVE_SSE2) &&                                                      \
    6969             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    6970             : 
    6971             : // ICC autovectorizer doesn't do a good job at generating good SSE code,
    6972             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
    6973             : #if defined(__GNUC__)
    6974             : __attribute__((noinline))
    6975             : #endif
    6976             : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
    6977             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)
    6978             : {
    6979             :     size_t i = 0;
    6980             :     constexpr size_t VALS_PER_ITER = 16;
    6981             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
    6982             :     {
    6983             :         __m128i xmm0 = _mm_loadu_si128(
    6984             :             reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
    6985             :         __m128i xmm1 = _mm_loadu_si128(
    6986             :             reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
    6987             :         __m128i xmm2 = _mm_loadu_si128(
    6988             :             reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
    6989             :         __m128i xmm3 = _mm_loadu_si128(
    6990             :             reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
    6991             :         auto tmp0 = _mm_unpacklo_epi8(
    6992             :             xmm0,
    6993             :             xmm1);  // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
    6994             :         auto tmp1 = _mm_unpackhi_epi8(
    6995             :             xmm0,
    6996             :             xmm1);  // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
    6997             :         auto tmp2 = _mm_unpacklo_epi8(
    6998             :             xmm2,
    6999             :             xmm3);  // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
    7000             :         auto tmp3 = _mm_unpackhi_epi8(
    7001             :             xmm2,
    7002             :             xmm3);  // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
    7003             :         auto tmp2_0 = _mm_unpacklo_epi16(
    7004             :             tmp0,
    7005             :             tmp2);  // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
    7006             :         auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
    7007             :         auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
    7008             :         auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
    7009             :         _mm_storeu_si128(
    7010             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
    7011             :             tmp2_0);
    7012             :         _mm_storeu_si128(
    7013             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
    7014             :             tmp2_1);
    7015             :         _mm_storeu_si128(
    7016             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
    7017             :             tmp2_2);
    7018             :         _mm_storeu_si128(
    7019             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
    7020             :             tmp2_3);
    7021             :     }
    7022             : #if defined(__clang__)
    7023             : #pragma clang loop vectorize(disable)
    7024             : #endif
    7025             :     for (; i < nIters; ++i)
    7026             :     {
    7027             :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];
    7028             :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];
    7029             :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];
    7030             :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];
    7031             :     }
    7032             : }
    7033             : 
    7034             : #else
    7035             : 
    7036             : #if defined(__GNUC__) && !defined(__clang__)
    7037             : __attribute__((optimize("tree-vectorize")))
    7038             : #endif
    7039             : #if defined(__GNUC__)
    7040             : __attribute__((noinline))
    7041             : #endif
    7042             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7043             : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
    7044             : #pragma clang diagnostic push
    7045             : #pragma clang diagnostic ignored "-Wpass-failed"
    7046             : #endif
    7047          30 : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
    7048             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)
    7049             : {
    7050             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7051             : #pragma clang loop vectorize(enable)
    7052             : #endif
    7053    49620700 :     for (size_t i = 0; i < nIters; ++i)
    7054             :     {
    7055    49620600 :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];
    7056    49620600 :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];
    7057    49620600 :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];
    7058    49620600 :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];
    7059             :     }
    7060          30 : }
    7061             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7062             : #pragma clang diagnostic pop
    7063             : #endif
    7064             : 
    7065             : #endif
    7066             : 
    7067             : /************************************************************************/
    7068             : /*                          GDALTranspose2D()                           */
    7069             : /************************************************************************/
    7070             : 
    7071             : /**
    7072             :  * Transpose a 2D array in a efficient (cache-oblivious) way.
    7073             :  *
    7074             :  * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
    7075             :  * @param eSrcType Data type of pSrc.
    7076             :  * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
    7077             :  * @param eDstType Data type of pDst.
    7078             :  * @param nSrcWidth Width of pSrc array.
    7079             :  * @param nSrcHeight Height of pSrc array.
    7080             :  * @since GDAL 3.11
    7081             :  */
    7082             : 
    7083         365 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
    7084             :                      GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
    7085             : {
    7086         365 :     if (eSrcType == eDstType && (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8))
    7087             :     {
    7088          70 :         if (nSrcHeight == 2)
    7089             :         {
    7090           9 :             GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
    7091             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    7092           9 :             return;
    7093             :         }
    7094          61 :         if (nSrcHeight == 4)
    7095             :         {
    7096          30 :             GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
    7097             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    7098          30 :             return;
    7099             :         }
    7100             : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                    \
    7101             :      (defined(__x86_64) || defined(_M_X64)))
    7102          31 :         if (CPLHaveRuntimeSSSE3())
    7103             :         {
    7104          31 :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    7105             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    7106             :                                        nSrcHeight);
    7107          31 :             return;
    7108             :         }
    7109             : #elif defined(USE_NEON_OPTIMIZATIONS)
    7110             :         {
    7111             :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    7112             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    7113             :                                        nSrcHeight);
    7114             :             return;
    7115             :         }
    7116             : #endif
    7117             :     }
    7118             : 
    7119             : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX)                \
    7120             :     GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>(                                 \
    7121             :         pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
    7122             : 
    7123             :     // clang-format off
    7124         295 :     switch (eDstType)
    7125             :     {
    7126          15 :         case GDT_UInt8:     CALL_GDALTranspose2D_internal(uint8_t, false); break;
    7127          15 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t, false); break;
    7128          33 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t, false); break;
    7129          20 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t, false); break;
    7130          24 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t, false); break;
    7131          16 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t, false); break;
    7132          16 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t, false); break;
    7133          16 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t, false); break;
    7134          16 :         case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16, false); break;
    7135          19 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float, false); break;
    7136          25 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double, false); break;
    7137          16 :         case GDT_CInt16:   CALL_GDALTranspose2D_internal(int16_t, true); break;
    7138          16 :         case GDT_CInt32:   CALL_GDALTranspose2D_internal(int32_t, true); break;
    7139          16 :         case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
    7140          16 :         case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
    7141          16 :         case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
    7142           0 :         case GDT_Unknown:
    7143             :         case GDT_TypeCount:
    7144           0 :             break;
    7145             :     }
    7146             :         // clang-format on
    7147             : 
    7148             : #undef CALL_GDALTranspose2D_internal
    7149             : }
    7150             : 
    7151             : /************************************************************************/
    7152             : /*                     ExtractBitAndConvertTo255()                      */
    7153             : /************************************************************************/
    7154             : 
    7155             : #if defined(__GNUC__) || defined(_MSC_VER)
    7156             : // Signedness of char implementation dependent, so be explicit.
    7157             : // Assumes 2-complement integer types and sign extension of right shifting
    7158             : // GCC guarantees such:
    7159             : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
    7160      143590 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
    7161             : {
    7162      143590 :     return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
    7163      143590 :                               7);
    7164             : }
    7165             : #else
    7166             : // Portable way
    7167             : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
    7168             : {
    7169             :     return (byVal & (1 << nBit)) ? 255 : 0;
    7170             : }
    7171             : #endif
    7172             : 
    7173             : /************************************************************************/
    7174             : /*                  ExpandEightPackedBitsToByteAt255()                  */
    7175             : /************************************************************************/
    7176             : 
    7177       17813 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
    7178             :                                                     GByte abyOutput[8])
    7179             : {
    7180       17813 :     abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);
    7181       17813 :     abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
    7182       17813 :     abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
    7183       17813 :     abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
    7184       17813 :     abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
    7185       17813 :     abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
    7186       17813 :     abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
    7187       17813 :     abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);
    7188       17813 : }
    7189             : 
    7190             : /************************************************************************/
    7191             : /*                 GDALExpandPackedBitsToByteAt0Or255()                 */
    7192             : /************************************************************************/
    7193             : 
    7194             : /** Expand packed-bits (ordered from most-significant bit to least one)
    7195             :   into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
    7196             :   at 1 to a byte at 255.
    7197             : 
    7198             :  The function does (in a possibly more optimized way) the following:
    7199             :  \code{.cpp}
    7200             :  for (size_t i = 0; i < nInputBits; ++i )
    7201             :  {
    7202             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
    7203             :  }
    7204             :  \endcode
    7205             : 
    7206             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    7207             :  @param pabyOutput Output array of nInputBits bytes.
    7208             :  @param nInputBits Number of valid bits in pabyInput.
    7209             : 
    7210             :  @since 3.11
    7211             : */
    7212             : 
    7213       45357 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
    7214             :                                         GByte *CPL_RESTRICT pabyOutput,
    7215             :                                         size_t nInputBits)
    7216             : {
    7217       45357 :     const size_t nInputWholeBytes = nInputBits / 8;
    7218       45357 :     size_t iByte = 0;
    7219             : 
    7220             : #ifdef HAVE_SSE2
    7221             :     // Mask to isolate each bit
    7222       45357 :     const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
    7223             :                                           8, 16, 32, 64, -128);
    7224       45357 :     const __m128i zero = _mm_setzero_si128();
    7225       45357 :     const __m128i all_ones = _mm_set1_epi8(-1);
    7226             : #ifdef __SSSE3__
    7227             :     const __m128i dispatch_two_bytes =
    7228             :         _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
    7229             : #endif
    7230       45357 :     constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
    7231      135866 :     for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
    7232             :     {
    7233       90509 :         __m128i reg_ori = _mm_loadu_si128(
    7234       90509 :             reinterpret_cast<const __m128i *>(pabyInput + iByte));
    7235             : 
    7236       90509 :         constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
    7237      814581 :         for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
    7238             :         {
    7239             :             // Given reg_ori = (A, B, ... 14 other bytes ...),
    7240             :             // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
    7241             : #ifdef __SSSE3__
    7242             :             __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
    7243             : #else
    7244      724072 :             __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
    7245      724072 :             reg = _mm_unpacklo_epi16(reg, reg);
    7246      724072 :             reg = _mm_unpacklo_epi32(reg, reg);
    7247             : #endif
    7248             : 
    7249             :             // Test if bits of interest are set
    7250      724072 :             reg = _mm_and_si128(reg, bit_mask);
    7251             : 
    7252             :             // Now test if those bits are set, by comparing to zero. So the
    7253             :             // result will be that bytes where bits are set will be at 0, and
    7254             :             // ones where they are cleared will be at 0xFF. So the inverse of
    7255             :             // the end result we want!
    7256      724072 :             reg = _mm_cmpeq_epi8(reg, zero);
    7257             : 
    7258             :             // Invert the result
    7259      724072 :             reg = _mm_andnot_si128(reg, all_ones);
    7260             : 
    7261             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
    7262             : 
    7263      724072 :             pabyOutput += SSE_REG_SIZE;
    7264             : 
    7265             :             // Right-shift of 2 bytes
    7266      724072 :             reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
    7267             :         }
    7268             :     }
    7269             : 
    7270             : #endif  // HAVE_SSE2
    7271             : 
    7272       63170 :     for (; iByte < nInputWholeBytes; ++iByte)
    7273             :     {
    7274       17813 :         ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
    7275       17813 :         pabyOutput += 8;
    7276             :     }
    7277       46443 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
    7278             :     {
    7279        1086 :         *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
    7280        1086 :         ++pabyOutput;
    7281             :     }
    7282       45357 : }
    7283             : 
    7284             : /************************************************************************/
    7285             : /*                   ExpandEightPackedBitsToByteAt1()                   */
    7286             : /************************************************************************/
    7287             : 
    7288      136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
    7289             :                                                   GByte abyOutput[8])
    7290             : {
    7291      136113 :     abyOutput[0] = (byVal >> 7) & 0x1;
    7292      136113 :     abyOutput[1] = (byVal >> 6) & 0x1;
    7293      136113 :     abyOutput[2] = (byVal >> 5) & 0x1;
    7294      136113 :     abyOutput[3] = (byVal >> 4) & 0x1;
    7295      136113 :     abyOutput[4] = (byVal >> 3) & 0x1;
    7296      136113 :     abyOutput[5] = (byVal >> 2) & 0x1;
    7297      136113 :     abyOutput[6] = (byVal >> 1) & 0x1;
    7298      136113 :     abyOutput[7] = (byVal >> 0) & 0x1;
    7299      136113 : }
    7300             : 
    7301             : /************************************************************************/
    7302             : /*                  GDALExpandPackedBitsToByteAt0Or1()                  */
    7303             : /************************************************************************/
    7304             : 
    7305             : /** Expand packed-bits (ordered from most-significant bit to least one)
    7306             :   into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
    7307             :   at 1 to a byte at 1.
    7308             : 
    7309             :  The function does (in a possibly more optimized way) the following:
    7310             :  \code{.cpp}
    7311             :  for (size_t i = 0; i < nInputBits; ++i )
    7312             :  {
    7313             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
    7314             :  }
    7315             :  \endcode
    7316             : 
    7317             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    7318             :  @param pabyOutput Output array of nInputBits bytes.
    7319             :  @param nInputBits Number of valid bits in pabyInput.
    7320             : 
    7321             :  @since 3.11
    7322             : */
    7323             : 
    7324        7033 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
    7325             :                                       GByte *CPL_RESTRICT pabyOutput,
    7326             :                                       size_t nInputBits)
    7327             : {
    7328        7033 :     const size_t nInputWholeBytes = nInputBits / 8;
    7329        7033 :     size_t iByte = 0;
    7330      143146 :     for (; iByte < nInputWholeBytes; ++iByte)
    7331             :     {
    7332      136113 :         ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
    7333      136113 :         pabyOutput += 8;
    7334             :     }
    7335       18886 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
    7336             :     {
    7337       11853 :         *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
    7338       11853 :         ++pabyOutput;
    7339             :     }
    7340        7033 : }

Generated by: LCOV version 1.14