LCOV - code coverage report
Current view: top level - gcore - rasterio.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2742 3019 90.8 %
Date: 2026-03-25 02:32:38 Functions: 690 730 94.5 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  Contains default implementation of GDALRasterBand::IRasterIO()
       5             :  *           and supporting functions of broader utility.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 1998, Frank Warmerdam
      10             :  * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal.h"
      17             : #include "gdal_priv.h"
      18             : 
      19             : #include <cassert>
      20             : #include <climits>
      21             : #include <cmath>
      22             : #include <cstddef>
      23             : #include <cstdio>
      24             : #include <cstdlib>
      25             : #include <cstring>
      26             : 
      27             : #include <algorithm>
      28             : #include <limits>
      29             : #include <stdexcept>
      30             : #include <type_traits>
      31             : 
      32             : #include "cpl_conv.h"
      33             : #include "cpl_cpu_features.h"
      34             : #include "cpl_error.h"
      35             : #include "cpl_float.h"
      36             : #include "cpl_progress.h"
      37             : #include "cpl_string.h"
      38             : #include "cpl_vsi.h"
      39             : #include "gdal_priv_templates.hpp"
      40             : #include "gdal_vrt.h"
      41             : #include "gdalwarper.h"
      42             : #include "memdataset.h"
      43             : #include "vrtdataset.h"
      44             : 
      45             : #if defined(__x86_64) || defined(_M_X64)
      46             : #include <emmintrin.h>
      47             : #include <immintrin.h>
      48             : #define HAVE_SSE2
      49             : // AVX2 dispatch: compile AVX2 code with target attribute, detect at runtime
      50             : #if defined(__GNUC__) || defined(__clang__)
      51             : #define HAVE_AVX2_DISPATCH
      52             : #elif defined(_MSC_VER)
      53             : #include <intrin.h>
      54             : #define HAVE_AVX2_DISPATCH
      55             : #define HAVE_AVX2_DISPATCH_MSVC
      56             : #elif defined(__AVX2__)
      57             : #define HAVE_AVX2_NATIVELY
      58             : #endif
      59             : #elif defined(USE_NEON_OPTIMIZATIONS)
      60             : #include "include_sse2neon.h"
      61             : #define HAVE_SSE2
      62             : #endif
      63             : 
      64             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
      65             : #include "rasterio_ssse3.h"
      66             : #ifdef __SSSE3__
      67             : #include <tmmintrin.h>
      68             : #endif
      69             : #endif
      70             : 
      71             : #ifdef __SSE4_1__
      72             : #include <smmintrin.h>
      73             : #endif
      74             : 
      75             : #ifdef __GNUC__
      76             : #define CPL_NOINLINE __attribute__((noinline))
      77             : #else
      78             : #define CPL_NOINLINE
      79             : #endif
      80             : 
      81             : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
      82             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
      83             :                              int nDstPixelStride, GPtrDiff_t nWordCount);
      84             : 
      85             : /************************************************************************/
      86             : /*                     DownsamplingIntegerXFactor()                     */
      87             : /************************************************************************/
      88             : 
                       /**
                        * Copies a run of pixels along X from the cached blocks of poBand into
                        * pabyDstData, taking one source pixel every nSrcXInc pixels starting at
                        * iSrcX. All blocks are fetched from block row nLBlockY.
                        *
                        * Template parameters:
                        *  - bSameDataType: when true, pixels are copied with memcpy() and
                        *    DATA_TYPE_SIZE is used as the pixel size in bytes; when false, the
                        *    size comes from GDALGetDataTypeSizeBytes(eDataType) and pixels are
                        *    converted with GDALCopyWords64().
                        *  - DATA_TYPE_SIZE: pixel size in bytes for the same-type case. A value
                        *    of 1 selects the GDALFastCopyByte() path.
                        *
                        * @param poBand        Band whose blocks are read (GetLockedBlockRef()).
                        * @param iSrcX         X coordinate of the first source pixel.
                        * @param nSrcXInc      Step, in source pixels, between two sampled pixels.
                        * @param iSrcOffsetCst Constant pixel offset added to the in-block X
                        *                      position before scaling by the pixel size
                        *                      (presumably the offset of the source row inside
                        *                      the block - confirm against the caller).
                        * @param pabyDstData   Destination of the first output pixel.
                        * @param nPixelSpace   Byte stride between two destination pixels.
                        * @param nBufXSize     Number of destination pixels requested. The first
                        *                      pixel is always written, whatever this value is.
                        * @param eDataType     Source data type (used when !bSameDataType).
                        * @param eBufType      Destination data type (used when !bSameDataType).
                        * @param nStartBlockX  In/out: X coordinate of the first pixel of the block
                        *                      referenced by poBlock; updated on each reload.
                        * @param nBlockXSize   Block width in pixels.
                        * @param poBlock       In/out: currently locked block. Must be non-null on
                        *                      entry when iSrcX already falls inside it. When a
                        *                      new block is needed, the previous one is unlocked
                        *                      with DropLock() and replaced. The block is still
                        *                      locked when the function returns true.
                        * @param nLBlockY      Block row index passed to GetLockedBlockRef().
                        *
                        * @return false if GetLockedBlockRef() returned nullptr (poBlock is then
                        *         nullptr and the previous block has been unlocked), true
                        *         otherwise.
                        */
      89             : template <bool bSameDataType, int DATA_TYPE_SIZE>
      90      695780 : static bool DownsamplingIntegerXFactor(
      91             :     GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
      92             :     GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
      93             :     GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
      94             :     int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
      95             : {
      96      695780 :     const int nBandDataSize =
      97             :         bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
                           // Number of pixels still to emit after the current one. The very
                           // last pixel is handled separately after the loop.
      98      695780 :     int nOuterLoopIters = nBufXSize - 1;
                           // Byte distance between two consecutive sampled source pixels.
      99      695780 :     const int nIncSrcOffset = nSrcXInc * nBandDataSize;
     100             :     const GByte *CPL_RESTRICT pabySrcData;
                           // One past the last X coordinate covered by the current block.
     101      695780 :     int nEndBlockX = nBlockXSize + nStartBlockX;
     102             : 
                           // The first pixel is processed by jumping straight into the loop
                           // body: either the block supplied by the caller already contains
                           // iSrcX, or a block must be fetched first.
     103      695780 :     if (iSrcX < nEndBlockX)
     104             :     {
     105      294999 :         CPLAssert(poBlock);
     106      294999 :         goto no_reload_block;
     107             :     }
     108      400781 :     goto reload_block;
     109             : 
     110             :     // Don't do the last iteration in the loop, as iSrcX might go beyond
     111             :     // nRasterXSize - 1
     112     1264973 :     while (--nOuterLoopIters >= 1)
     113             :     {
     114      201834 :         iSrcX += nSrcXInc;
     115      201834 :         pabySrcData += nIncSrcOffset;
     116      201834 :         pabyDstData += nPixelSpace;
     117             : 
     118             :         /* --------------------------------------------------------------------
     119             :          */
     120             :         /*      Ensure we have the appropriate block loaded. */
     121             :         /* --------------------------------------------------------------------
     122             :          */
     123      201834 :         if (iSrcX >= nEndBlockX)
     124             :         {
     125      201834 :         reload_block:
     126             :         {
                                   // Locate the block containing iSrcX and refresh the
                                   // in/out block bounds.
     127      615205 :             const int nLBlockX = iSrcX / nBlockXSize;
     128      615205 :             nStartBlockX = nLBlockX * nBlockXSize;
     129      615205 :             nEndBlockX = nStartBlockX + nBlockXSize;
     130             : 
                                   // Release the previously held block before locking the
                                   // next one.
     131      615205 :             if (poBlock != nullptr)
     132      341376 :                 poBlock->DropLock();
     133             : 
     134      615205 :             poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     135      615205 :             if (poBlock == nullptr)
     136             :             {
     137           1 :                 return false;
     138             :             }
     139             :         }
     140             : 
     141      615204 :         no_reload_block:
                                   // (Re)compute the source pointer from scratch for the
                                   // current iSrcX inside the current block.
     142             :             const GByte *pabySrcBlock =
     143     1264973 :                 static_cast<const GByte *>(poBlock->GetDataRef());
     144     1264973 :             GPtrDiff_t iSrcOffset =
     145     1264973 :                 (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
     146     1264973 :             pabySrcData = pabySrcBlock + iSrcOffset;
     147             :         }
     148             : 
     149             :         /* --------------------------------------------------------------------
     150             :          */
     151             :         /*      Copy the maximum run of pixels. */
     152             :         /* --------------------------------------------------------------------
     153             :          */
     154             : 
                               // Number of sampled pixels that still fall inside the current
                               // block (ceiling division by nSrcXInc), capped by the number of
                               // pixels the loop is still allowed to emit.
     155     1264973 :         const int nIters = std::min(
     156     1264973 :             (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
     157             :         if (bSameDataType)
     158             :         {
                                   // The current pixel is always copied, even if nIters <= 0.
     159     1264530 :             memcpy(pabyDstData, pabySrcData, nBandDataSize);
     160     1264530 :             if (nIters > 1)
     161             :             {
     162             :                 if (DATA_TYPE_SIZE == 1)
     163             :                 {
                                           // Copy the remaining nIters - 1 bytes in one call,
                                           // then advance by nIters - 2 more strides so both
                                           // pointers end on the last pixel copied, exactly
                                           // as the generic loop below leaves them.
     164      326250 :                     pabySrcData += nIncSrcOffset;
     165      326250 :                     pabyDstData += nPixelSpace;
     166      326250 :                     GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
     167      326250 :                                      nPixelSpace, nIters - 1);
     168      326250 :                     pabySrcData +=
     169      326250 :                         static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
     170      326250 :                     pabyDstData +=
     171      326250 :                         static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
     172             :                 }
     173             :                 else
     174             :                 {
     175     4395716 :                     for (int i = 0; i < nIters - 1; i++)
     176             :                     {
     177     4197550 :                         pabySrcData += nIncSrcOffset;
     178     4197550 :                         pabyDstData += nPixelSpace;
     179     4197550 :                         memcpy(pabyDstData, pabySrcData, nBandDataSize);
     180             :                     }
     181             :                 }
                                       // Account for the nIters - 1 extra pixels consumed in
                                       // this pass.
     182      524420 :                 iSrcX += nSrcXInc * (nIters - 1);
     183      524420 :                 nOuterLoopIters -= nIters - 1;
     184             :             }
     185             :         }
     186             :         else
     187             :         {
     188             :             // Type to type conversion ...
                                   // std::max(1, nIters) guarantees the current pixel is
                                   // converted even when nIters <= 0.
     189         443 :             GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
     190         443 :                             eBufType, nPixelSpace, std::max(1, nIters));
     191         443 :             if (nIters > 1)
     192             :             {
     193         216 :                 pabySrcData +=
     194         216 :                     static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
     195         216 :                 pabyDstData +=
     196         216 :                     static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
     197         216 :                 iSrcX += nSrcXInc * (nIters - 1);
     198         216 :                 nOuterLoopIters -= nIters - 1;
     199             :             }
     200             :         }
     201             :     }
     202             : 
     203             :     // Handle the last iteration here so that iSrcX cannot go beyond
                           // nRasterXSize - 1: the source position is clamped, then the loop
                           // body is re-entered once. On the next evaluation of the loop
                           // condition nOuterLoopIters becomes -1, so this branch is not taken
                           // a second time and the function returns.
     204     1063139 :     if (nOuterLoopIters == 0)
     205             :     {
     206      367360 :         const int nRasterXSize = poBand->GetXSize();
     207      367360 :         iSrcX =
     208      734720 :             static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
     209      367360 :                                       static_cast<GInt64>(nRasterXSize - 1)));
     210      367360 :         pabyDstData += nPixelSpace;
     211      367360 :         if (iSrcX < nEndBlockX)
     212             :         {
     213      354770 :             goto no_reload_block;
     214             :         }
     215       12590 :         goto reload_block;
     216             :     }
     217      695779 :     return true;
     218             : }
     219             : 
                       // Returns a * b, with the result type deduced from the operands.
                       // The function is tagged with CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW
                       // (presumably so that wrap-around of an unsigned product is not
                       // reported by the unsigned-integer-overflow sanitizer - confirm
                       // against the macro definition, which is not in this file).
                       // Used by GDALRasterBand::IRasterIO() to compute nXSpanSize.
     220             : template <class A, class B>
     221     2818450 : CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(A a, B b)
     222             : {
     223     2818450 :     return a * b;
     224             : }
     225             : 
     226             : /************************************************************************/
     227             : /*                             IRasterIO()                              */
     228             : /*                                                                      */
     229             : /*      Default internal implementation of RasterIO() ... utilizes      */
     230             : /*      the Block access methods to satisfy the request.  This would    */
     231             : /*      normally only be overridden by formats with overviews.          */
     232             : /************************************************************************/
     233             : 
     234     6180460 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
     235             :                                  int nXSize, int nYSize, void *pData,
     236             :                                  int nBufXSize, int nBufYSize,
     237             :                                  GDALDataType eBufType, GSpacing nPixelSpace,
     238             :                                  GSpacing nLineSpace,
     239             :                                  GDALRasterIOExtraArg *psExtraArg)
     240             : 
     241             : {
     242     6180460 :     if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
     243             :     {
     244           0 :         CPLError(eFlushBlockErr, CPLE_AppDefined,
     245             :                  "An error occurred while writing a dirty block "
     246             :                  "from GDALRasterBand::IRasterIO");
     247           0 :         CPLErr eErr = eFlushBlockErr;
     248           0 :         eFlushBlockErr = CE_None;
     249           0 :         return eErr;
     250             :     }
     251     6180460 :     if (nBlockXSize <= 0 || nBlockYSize <= 0)
     252             :     {
     253           0 :         CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
     254           0 :         return CE_Failure;
     255             :     }
     256             : 
     257     6180460 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
     258     6180460 :     const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
     259     6180460 :     GByte dummyBlock[2] = {0, 0};
     260     6180460 :     GByte *pabySrcBlock =
     261             :         dummyBlock; /* to avoid Coverity warning about nullptr dereference */
     262     6180460 :     GDALRasterBlock *poBlock = nullptr;
     263     6180460 :     const bool bUseIntegerRequestCoords =
     264     6545380 :         (!psExtraArg->bFloatingPointWindowValidity ||
     265      364919 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
     266      339987 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
     267             : 
     268             :     /* ==================================================================== */
     269             :     /*      A common case is the data requested with the destination        */
     270             :     /*      is packed, and the block width is the raster width.             */
     271             :     /* ==================================================================== */
     272     6088650 :     if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
     273     3234250 :         nBlockXSize == GetXSize() && nBufXSize == nXSize &&
     274    12269100 :         nBufYSize == nYSize && bUseIntegerRequestCoords)
     275             :     {
     276     3096600 :         CPLErr eErr = CE_None;
     277     3096600 :         int nLBlockY = -1;
     278             : 
     279     9750600 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     280             :         {
     281     6655090 :             const int iSrcY = iBufYOff + nYOff;
     282             : 
     283     6655090 :             if (iSrcY < nLBlockY * nBlockYSize ||
     284     6655090 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
     285             :             {
     286     3365560 :                 nLBlockY = iSrcY / nBlockYSize;
     287     3365560 :                 bool bJustInitialize =
     288      297352 :                     eRWFlag == GF_Write && nXOff == 0 &&
     289     3720830 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     290       57919 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
     291             : 
     292             :                 // Is this a partial tile at right and/or bottom edges of
     293             :                 // the raster, and that is going to be completely written?
     294             :                 // If so, do not load it from storage, but zero it so that
     295             :                 // the content outside of the validity area is initialized.
     296     3365560 :                 bool bMemZeroBuffer = false;
     297      297352 :                 if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
     298       24975 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     299     3663000 :                     nYOff + nYSize == GetYSize() &&
     300          89 :                     nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
     301             :                 {
     302          89 :                     bJustInitialize = true;
     303          89 :                     bMemZeroBuffer = true;
     304             :                 }
     305             : 
     306     3365560 :                 if (poBlock)
     307      268959 :                     poBlock->DropLock();
     308             : 
     309     3365560 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     310     3365560 :                 poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
     311     3365560 :                 if (poBlock == nullptr)
     312             :                 {
     313        1078 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     314             :                         nullptr)
     315             :                     {
     316           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     317             :                                  "GetBlockRef failed at X block offset %d, "
     318             :                                  "Y block offset %d%s",
     319             :                                  0, nLBlockY,
     320           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     321           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     322             :                                      : "");
     323             :                     }
     324        1078 :                     eErr = CE_Failure;
     325        1078 :                     break;
     326             :                 }
     327             : 
     328     3364480 :                 if (eRWFlag == GF_Write)
     329      297352 :                     poBlock->MarkDirty();
     330             : 
     331     3364480 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     332     3364480 :                 if (bMemZeroBuffer)
     333             :                 {
     334          89 :                     memset(pabySrcBlock, 0,
     335          89 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     336          89 :                                nBlockXSize * nBlockYSize);
     337             :                 }
     338             :             }
     339             : 
     340     6654010 :             const auto nSrcByteOffset =
     341     6654010 :                 (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
     342     6654010 :                      nBlockXSize +
     343     6654010 :                  nXOff) *
     344     6654010 :                 nBandDataSize;
     345             : 
     346     6654010 :             if (eDataType == eBufType)
     347             :             {
     348     2990760 :                 if (eRWFlag == GF_Read)
     349     2518250 :                     memcpy(static_cast<GByte *>(pData) +
     350     2518250 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     351     2518250 :                            pabySrcBlock + nSrcByteOffset,
     352             :                            static_cast<size_t>(nLineSpace));
     353             :                 else
     354      472505 :                     memcpy(pabySrcBlock + nSrcByteOffset,
     355      472505 :                            static_cast<GByte *>(pData) +
     356      472505 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     357             :                            static_cast<size_t>(nLineSpace));
     358             :             }
     359             :             else
     360             :             {
     361             :                 // Type to type conversion.
     362     3663250 :                 if (eRWFlag == GF_Read)
     363     3641190 :                     GDALCopyWords64(
     364     3641190 :                         pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
     365             :                         static_cast<GByte *>(pData) +
     366     3641190 :                             static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     367             :                         eBufType, static_cast<int>(nPixelSpace), nBufXSize);
     368             :                 else
     369       22065 :                     GDALCopyWords64(static_cast<GByte *>(pData) +
     370       22065 :                                         static_cast<GPtrDiff_t>(iBufYOff) *
     371             :                                             nLineSpace,
     372             :                                     eBufType, static_cast<int>(nPixelSpace),
     373       22065 :                                     pabySrcBlock + nSrcByteOffset, eDataType,
     374             :                                     nBandDataSize, nBufXSize);
     375             :             }
     376             : 
     377     6741880 :             if (psExtraArg->pfnProgress != nullptr &&
     378       87868 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     379             :                                          psExtraArg->pProgressData))
     380             :             {
     381           5 :                 eErr = CE_Failure;
     382           5 :                 break;
     383             :             }
     384             :         }
     385             : 
     386     3096600 :         if (poBlock)
     387     3095520 :             poBlock->DropLock();
     388             : 
     389     3096600 :         return eErr;
     390             :     }
     391             : 
     392             :     /* ==================================================================== */
     393             :     /*      Do we have overviews that would be appropriate to satisfy       */
     394             :     /*      this request?                                                   */
     395             :     /* ==================================================================== */
     396     3083860 :     if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
     397             :         eRWFlag == GF_Read)
     398             :     {
     399             :         GDALRasterIOExtraArg sExtraArg;
     400        2967 :         GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
     401             : 
     402             :         const int nOverview =
     403        2967 :             GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
     404             :                                           nBufXSize, nBufYSize, &sExtraArg);
     405        2967 :         if (nOverview >= 0)
     406             :         {
     407        2892 :             GDALRasterBand *poOverviewBand = GetOverview(nOverview);
     408        2892 :             if (poOverviewBand == nullptr)
     409        2892 :                 return CE_Failure;
     410             : 
     411        2892 :             return poOverviewBand->RasterIO(
     412             :                 eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
     413        2892 :                 nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
     414             :         }
     415             :     }
     416             : 
     417      891491 :     if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
     418           6 :         nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
     419     3972470 :         nLineSpace == nPixelSpace * nBufXSize &&
     420           6 :         CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
     421             :     {
     422           0 :         memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
     423           0 :         return CE_None;
     424             :     }
     425             : 
     426             :     /* ==================================================================== */
     427             :     /*      The second case when we don't need subsample data but likely    */
     428             :     /*      need data type conversion.                                      */
     429             :     /* ==================================================================== */
     430     3080970 :     if (  // nPixelSpace == nBufDataSize &&
     431     3080970 :         nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
     432             :     {
     433             : #if DEBUG_VERBOSE
     434             :         printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
     435             :                nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
     436             : #endif
     437             : 
     438             :         /* --------------------------------------------------------------------
     439             :          */
     440             :         /*      Loop over buffer computing source locations. */
     441             :         /* --------------------------------------------------------------------
     442             :          */
     443             :         // Calculate starting values out of loop
     444     2503120 :         const int nLBlockXStart = nXOff / nBlockXSize;
     445     2503120 :         const int nXSpanEnd = nBufXSize + nXOff;
     446             : 
     447     2503120 :         int nYInc = 0;
     448     5046990 :         for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
     449     2543870 :              iBufYOff += nYInc, iSrcY += nYInc)
     450             :         {
     451     2543940 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     452             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     453     2543940 :             int nLBlockY = iSrcY / nBlockYSize;
     454     2543940 :             int nLBlockX = nLBlockXStart;
     455     2543940 :             int iSrcX = nXOff;
     456     5362320 :             while (iSrcX < nXSpanEnd)
     457             :             {
     458     2818450 :                 int nXSpan = nLBlockX * nBlockXSize;
     459     2818450 :                 if (nXSpan < INT_MAX - nBlockXSize)
     460     2818450 :                     nXSpan += nBlockXSize;
     461             :                 else
     462           0 :                     nXSpan = INT_MAX;
     463     2818450 :                 const int nXRight = nXSpan;
     464     2818450 :                 nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
     465             : 
     466             :                 const size_t nXSpanSize =
     467     2818450 :                     CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
     468             : 
     469     2818450 :                 bool bJustInitialize =
     470     2042970 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
     471       38035 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
     472     4887790 :                     nXOff <= nLBlockX * nBlockXSize &&
     473       26364 :                     nXOff + nXSize >= nXRight;
     474             : 
     475             :                 // Is this a partial tile at right and/or bottom edges of
     476             :                 // the raster, and that is going to be completely written?
     477             :                 // If so, do not load it from storage, but zero it so that
     478             :                 // the content outside of the validity area is initialized.
     479     2818450 :                 bool bMemZeroBuffer = false;
     480     2042970 :                 if (eRWFlag == GF_Write && !bJustInitialize &&
     481     2017850 :                     nXOff <= nLBlockX * nBlockXSize &&
     482     2016190 :                     nYOff <= nLBlockY * nBlockYSize &&
     483       12145 :                     (nXOff + nXSize >= nXRight ||
     484             :                      // cppcheck-suppress knownConditionTrueFalse
     485     4864140 :                      (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
     486       11965 :                     (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
     487       10743 :                      (nYOff + nYSize == GetYSize() &&
     488        1951 :                       nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
     489             :                 {
     490        3173 :                     bJustInitialize = true;
     491        3173 :                     bMemZeroBuffer = true;
     492             :                 }
     493             : 
     494             :                 /* --------------------------------------------------------------------
     495             :                  */
     496             :                 /*      Ensure we have the appropriate block loaded. */
     497             :                 /* --------------------------------------------------------------------
     498             :                  */
     499     2818450 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     500     2818450 :                 poBlock =
     501     2818450 :                     GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     502     2818450 :                 if (!poBlock)
     503             :                 {
     504          73 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     505             :                         nullptr)
     506             :                     {
     507           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     508             :                                  "GetBlockRef failed at X block offset %d, "
     509             :                                  "Y block offset %d%s",
     510             :                                  nLBlockX, nLBlockY,
     511           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     512           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     513             :                                      : "");
     514             :                     }
     515          73 :                     return (CE_Failure);
     516             :                 }
     517             : 
     518     2818380 :                 if (eRWFlag == GF_Write)
     519     2042970 :                     poBlock->MarkDirty();
     520             : 
     521     2818380 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     522     2818380 :                 if (bMemZeroBuffer)
     523             :                 {
     524        3173 :                     memset(pabySrcBlock, 0,
     525        3173 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     526        3173 :                                nBlockXSize * nBlockYSize);
     527             :                 }
     528             :                 /* --------------------------------------------------------------------
     529             :                  */
     530             :                 /*      Copy over this chunk of data. */
     531             :                 /* --------------------------------------------------------------------
     532             :                  */
     533     2818380 :                 GPtrDiff_t iSrcOffset =
     534     2818380 :                     (static_cast<GPtrDiff_t>(iSrcX) -
     535     2818380 :                      static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
     536     2818380 :                      (static_cast<GPtrDiff_t>(iSrcY) -
     537     2818380 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     538     2818380 :                          nBlockXSize) *
     539     2818380 :                     nBandDataSize;
     540             :                 // Fill up as many rows as possible for the loaded block.
     541     5636750 :                 const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
     542     2818380 :                                           nBufYSize - iBufYOff);
     543    60959400 :                 for (int k = 0; k < kmax; k++)
     544             :                 {
     545    58141000 :                     if (eDataType == eBufType && nPixelSpace == nBufDataSize)
     546             :                     {
     547    53739300 :                         if (eRWFlag == GF_Read)
     548    49301200 :                             memcpy(static_cast<GByte *>(pData) + iBufOffset +
     549    49301200 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     550    49301200 :                                    pabySrcBlock + iSrcOffset, nXSpanSize);
     551             :                         else
     552     4438130 :                             memcpy(pabySrcBlock + iSrcOffset,
     553     4438130 :                                    static_cast<GByte *>(pData) + iBufOffset +
     554     4438130 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     555             :                                    nXSpanSize);
     556             :                     }
     557             :                     else
     558             :                     {
     559             :                         /* type to type conversion */
     560     4401720 :                         if (eRWFlag == GF_Read)
     561     4251510 :                             GDALCopyWords64(
     562     4251510 :                                 pabySrcBlock + iSrcOffset, eDataType,
     563             :                                 nBandDataSize,
     564     4251510 :                                 static_cast<GByte *>(pData) + iBufOffset +
     565     4251510 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     566             :                                 eBufType, static_cast<int>(nPixelSpace),
     567             :                                 nXSpan);
     568             :                         else
     569      150209 :                             GDALCopyWords64(
     570      150209 :                                 static_cast<GByte *>(pData) + iBufOffset +
     571      150209 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     572             :                                 eBufType, static_cast<int>(nPixelSpace),
     573      150209 :                                 pabySrcBlock + iSrcOffset, eDataType,
     574             :                                 nBandDataSize, nXSpan);
     575             :                     }
     576             : 
     577    58141000 :                     iSrcOffset +=
     578    58141000 :                         static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
     579             :                 }
     580             : 
     581             :                 iBufOffset =
     582     2818380 :                     CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
     583     2818380 :                 nLBlockX++;
     584     2818380 :                 iSrcX += nXSpan;
     585             : 
     586     2818380 :                 poBlock->DropLock();
     587     2818380 :                 poBlock = nullptr;
     588             :             }
     589             : 
     590             :             /* Compute the increment to go on a block boundary */
     591     2543870 :             nYInc = nBlockYSize - (iSrcY % nBlockYSize);
     592             : 
     593     2545760 :             if (psExtraArg->pfnProgress != nullptr &&
     594        1884 :                 !psExtraArg->pfnProgress(
     595     2545760 :                     1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
     596             :                     psExtraArg->pProgressData))
     597             :             {
     598           0 :                 return CE_Failure;
     599             :             }
     600             :         }
     601             : 
     602     2503040 :         return CE_None;
     603             :     }
     604             : 
     605             :     /* ==================================================================== */
     606             :     /*      Loop reading required source blocks to satisfy output           */
     607             :     /*      request.  This is the most general implementation.              */
     608             :     /* ==================================================================== */
     609             : 
     610      577855 :     double dfXOff = nXOff;
     611      577855 :     double dfYOff = nYOff;
     612      577855 :     double dfXSize = nXSize;
     613      577855 :     double dfYSize = nYSize;
     614      577855 :     if (psExtraArg->bFloatingPointWindowValidity)
     615             :     {
     616      242949 :         dfXOff = psExtraArg->dfXOff;
     617      242949 :         dfYOff = psExtraArg->dfYOff;
     618      242949 :         dfXSize = psExtraArg->dfXSize;
     619      242949 :         dfYSize = psExtraArg->dfYSize;
     620             :     }
     621             : 
     622             :     /* -------------------------------------------------------------------- */
     623             :     /*      Compute stepping increment.                                     */
     624             :     /* -------------------------------------------------------------------- */
     625      577855 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
     626      577855 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
     627      577855 :     CPLErr eErr = CE_None;
     628             : 
     629      577855 :     if (eRWFlag == GF_Write)
     630             :     {
     631             :         /* --------------------------------------------------------------------
     632             :          */
     633             :         /*    Write case */
     634             :         /*    Loop over raster window computing source locations in the buffer.
     635             :          */
     636             :         /* --------------------------------------------------------------------
     637             :          */
     638      166655 :         GByte *pabyDstBlock = nullptr;
     639      166655 :         int nLBlockX = -1;
     640      166655 :         int nLBlockY = -1;
     641             : 
     642     1260010 :         for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
     643             :         {
     644     1093360 :             const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
     645             : 
     646    12384200 :             for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
     647             :             {
     648    11290800 :                 const int iBufXOff =
     649    11290800 :                     static_cast<int>((iDstX - nXOff) / dfSrcXInc);
     650    11290800 :                 GPtrDiff_t iBufOffset =
     651    11290800 :                     static_cast<GPtrDiff_t>(iBufYOff) *
     652             :                         static_cast<GPtrDiff_t>(nLineSpace) +
     653    11290800 :                     iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
     654             : 
     655             :                 // FIXME: this code likely doesn't work if the dirty block gets
     656             :                 // flushed to disk before being completely written.
     657             :                 // In the meantime, bJustInitialize should probably be set to
     658             :                 // FALSE even if it is not ideal performance wise, and for
     659             :                 // lossy compression.
     660             : 
     661             :                 /* --------------------------------------------------------------------
     662             :                  */
     663             :                 /*      Ensure we have the appropriate block loaded. */
     664             :                 /* --------------------------------------------------------------------
     665             :                  */
     666    11290800 :                 if (iDstX < nLBlockX * nBlockXSize ||
     667    11041500 :                     iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
     668    10584800 :                     iDstY < nLBlockY * nBlockYSize ||
     669    10584800 :                     iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
     670             :                 {
     671      738702 :                     nLBlockX = iDstX / nBlockXSize;
     672      738702 :                     nLBlockY = iDstY / nBlockYSize;
     673             : 
     674      738702 :                     const bool bJustInitialize =
     675     1065990 :                         nYOff <= nLBlockY * nBlockYSize &&
     676      327291 :                         nYOff + nYSize - nBlockYSize >=
     677      327291 :                             nLBlockY * nBlockYSize &&
     678     1116320 :                         nXOff <= nLBlockX * nBlockXSize &&
     679       50325 :                         nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
     680             :                     /*bool bMemZeroBuffer = FALSE;
     681             :                     if( !bJustInitialize &&
     682             :                         nXOff <= nLBlockX * nBlockXSize &&
     683             :                         nYOff <= nLBlockY * nBlockYSize &&
     684             :                         (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
     685             :                          (nXOff + nXSize == GetXSize() &&
     686             :                          (nLBlockX+1) * nBlockXSize > GetXSize())) &&
     687             :                         (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
     688             :                          (nYOff + nYSize == GetYSize() &&
     689             :                          (nLBlockY+1) * nBlockYSize > GetYSize())) )
     690             :                     {
     691             :                         bJustInitialize = TRUE;
     692             :                         bMemZeroBuffer = TRUE;
     693             :                     }*/
     694      738702 :                     if (poBlock != nullptr)
     695      572047 :                         poBlock->DropLock();
     696             : 
     697      738702 :                     poBlock =
     698      738702 :                         GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     699      738702 :                     if (poBlock == nullptr)
     700             :                     {
     701           0 :                         return (CE_Failure);
     702             :                     }
     703             : 
     704      738702 :                     poBlock->MarkDirty();
     705             : 
     706      738702 :                     pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
     707             :                     /*if( bMemZeroBuffer )
     708             :                     {
     709             :                         memset(pabyDstBlock, 0,
     710             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
     711             :                     * nBlockYSize);
     712             :                     }*/
     713             :                 }
     714             : 
     715             :                 // To make Coverity happy. Should not happen by design.
     716    11290800 :                 if (pabyDstBlock == nullptr)
     717             :                 {
     718           0 :                     CPLAssert(false);
     719             :                     eErr = CE_Failure;
     720             :                     break;
     721             :                 }
     722             : 
     723             :                 /* --------------------------------------------------------------------
     724             :                  */
     725             :                 /*      Copy over this pixel of data. */
     726             :                 /* --------------------------------------------------------------------
     727             :                  */
     728    11290800 :                 GPtrDiff_t iDstOffset =
     729    11290800 :                     (static_cast<GPtrDiff_t>(iDstX) -
     730    11290800 :                      static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
     731    11290800 :                      (static_cast<GPtrDiff_t>(iDstY) -
     732    11290800 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     733    11290800 :                          nBlockXSize) *
     734    11290800 :                     nBandDataSize;
     735             : 
     736    11290800 :                 if (eDataType == eBufType)
     737             :                 {
     738    11287700 :                     memcpy(pabyDstBlock + iDstOffset,
     739    11287700 :                            static_cast<GByte *>(pData) + iBufOffset,
     740             :                            nBandDataSize);
     741             :                 }
     742             :                 else
     743             :                 {
     744             :                     /* type to type conversion ... ouch, this is expensive way
     745             :                     of handling single words */
     746        3096 :                     GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
     747        3096 :                                     eBufType, 0, pabyDstBlock + iDstOffset,
     748             :                                     eDataType, 0, 1);
     749             :                 }
     750             :             }
     751             : 
     752     1093360 :             if (psExtraArg->pfnProgress != nullptr &&
     753           0 :                 !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
     754             :                                          psExtraArg->pProgressData))
     755             :             {
     756           0 :                 eErr = CE_Failure;
     757           0 :                 break;
     758             :             }
     759             :         }
     760             :     }
     761             :     else
     762             :     {
     763      411200 :         if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
     764             :         {
     765       41928 :             if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
     766       13514 :                  psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
     767       13476 :                  psExtraArg->eResampleAlg == GRIORA_Bilinear ||
     768       28455 :                  psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
     769        3191 :                 GetColorTable() != nullptr)
     770             :             {
     771           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     772             :                          "Resampling method not supported on paletted band. "
     773             :                          "Falling back to nearest neighbour");
     774             :             }
     775       14210 :             else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
     776           3 :                      GDALDataTypeIsComplex(eDataType))
     777             :             {
     778           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     779             :                          "Resampling method not supported on complex data type "
     780             :                          "band. Falling back to nearest neighbour");
     781             :             }
     782             :             else
     783             :             {
     784       14207 :                 return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
     785             :                                          pData, nBufXSize, nBufYSize, eBufType,
     786       14207 :                                          nPixelSpace, nLineSpace, psExtraArg);
     787             :             }
     788             :         }
     789             : 
     790      396993 :         int nLimitBlockY = 0;
     791      396993 :         const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
     792      396993 :         int nStartBlockX = -nBlockXSize;
     793      396993 :         constexpr double EPS = 1e-10;
     794      396993 :         int nLBlockY = -1;
     795      396993 :         const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
     796      396993 :         const bool bIntegerXFactor =
     797      372760 :             bUseIntegerRequestCoords &&
     798      670822 :             static_cast<int>(dfSrcXInc) == dfSrcXInc &&
     799      273829 :             static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
     800             : 
     801             :         /* --------------------------------------------------------------------
     802             :          */
     803             :         /*      Read case */
     804             :         /*      Loop over buffer computing source locations. */
     805             :         /* --------------------------------------------------------------------
     806             :          */
     807     2367020 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     808             :         {
     809             :             // Add small epsilon to avoid some numeric precision issues.
     810     1970040 :             const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
     811     1970040 :             const int iSrcY = static_cast<int>(std::min(
     812     1970040 :                 std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
     813             : 
     814     1970040 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     815             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     816             : 
     817     1970040 :             if (iSrcY >= nLimitBlockY)
     818             :             {
     819      438011 :                 nLBlockY = iSrcY / nBlockYSize;
     820      438011 :                 nLimitBlockY = nLBlockY * nBlockYSize;
     821      438011 :                 if (nLimitBlockY < INT_MAX - nBlockYSize)
     822      438011 :                     nLimitBlockY += nBlockYSize;
     823             :                 else
     824           0 :                     nLimitBlockY = INT_MAX;
     825             :                 // Make sure a new block is loaded.
     826      438011 :                 nStartBlockX = -nBlockXSize;
     827             :             }
     828     1532030 :             else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
     829             :             {
     830             :                 // Make sure a new block is loaded.
     831      437363 :                 nStartBlockX = -nBlockXSize;
     832             :             }
     833             : 
     834     1970040 :             GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
     835     1970040 :                                        static_cast<GPtrDiff_t>(nBlockXSize);
     836             : 
     837     1970040 :             if (bIntegerXFactor)
     838             :             {
     839      695780 :                 int iSrcX = static_cast<int>(dfSrcXStart);
     840      695780 :                 const int nSrcXInc = static_cast<int>(dfSrcXInc);
     841      695780 :                 GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
     842      695780 :                 bool bRet = false;
     843      695780 :                 if (bByteCopy)
     844             :                 {
     845      585772 :                     bRet = DownsamplingIntegerXFactor<true, 1>(
     846             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     847             :                         static_cast<int>(nPixelSpace), nBufXSize, GDT_UInt8,
     848             :                         GDT_UInt8, nStartBlockX, nBlockXSize, poBlock,
     849             :                         nLBlockY);
     850             :                 }
     851      110008 :                 else if (eDataType == eBufType)
     852             :                 {
     853      109783 :                     switch (nBandDataSize)
     854             :                     {
     855      109630 :                         case 2:
     856      109630 :                             bRet = DownsamplingIntegerXFactor<true, 2>(
     857             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     858             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     859             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     860             :                                 nBlockXSize, poBlock, nLBlockY);
     861      109630 :                             break;
     862          55 :                         case 4:
     863          55 :                             bRet = DownsamplingIntegerXFactor<true, 4>(
     864             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     865             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     866             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     867             :                                 nBlockXSize, poBlock, nLBlockY);
     868          55 :                             break;
     869          96 :                         case 8:
     870          96 :                             bRet = DownsamplingIntegerXFactor<true, 8>(
     871             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     872             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     873             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     874             :                                 nBlockXSize, poBlock, nLBlockY);
     875          96 :                             break;
     876           2 :                         case 16:
     877           2 :                             bRet = DownsamplingIntegerXFactor<true, 16>(
     878             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     879             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     880             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     881             :                                 nBlockXSize, poBlock, nLBlockY);
     882           2 :                             break;
     883           0 :                         default:
     884           0 :                             CPLAssert(false);
     885             :                             break;
     886             :                     }
     887             :                 }
     888             :                 else
     889             :                 {
     890         225 :                     bRet = DownsamplingIntegerXFactor<false, 0>(
     891             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     892             :                         static_cast<int>(nPixelSpace), nBufXSize, eDataType,
     893             :                         eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     894             :                 }
     895      695780 :                 if (!bRet)
     896           1 :                     eErr = CE_Failure;
     897             :             }
     898             :             else
     899             :             {
     900     1274260 :                 double dfSrcX = dfSrcXStart;
     901   503811000 :                 for (int iBufXOff = 0; iBufXOff < nBufXSize;
     902   502537000 :                      iBufXOff++, dfSrcX += dfSrcXInc)
     903             :                 {
     904             :                     // TODO?: try to avoid the clamping for most iterations
     905             :                     const int iSrcX = static_cast<int>(
     906  1005070000 :                         std::min(std::max(0.0, dfSrcX),
     907   502537000 :                                  static_cast<double>(nRasterXSize - 1)));
     908             : 
     909             :                     /* --------------------------------------------------------------------
     910             :                      */
     911             :                     /*      Ensure we have the appropriate block loaded. */
     912             :                     /* --------------------------------------------------------------------
     913             :                      */
     914   502537000 :                     if (iSrcX >= nBlockXSize + nStartBlockX)
     915             :                     {
     916     1697820 :                         const int nLBlockX = iSrcX / nBlockXSize;
     917     1697820 :                         nStartBlockX = nLBlockX * nBlockXSize;
     918             : 
     919     1697820 :                         if (poBlock != nullptr)
     920     1574650 :                             poBlock->DropLock();
     921             : 
     922     1697820 :                         poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     923     1697820 :                         if (poBlock == nullptr)
     924             :                         {
     925           9 :                             eErr = CE_Failure;
     926           9 :                             break;
     927             :                         }
     928             : 
     929             :                         pabySrcBlock =
     930     1697810 :                             static_cast<GByte *>(poBlock->GetDataRef());
     931             :                     }
     932   502537000 :                     const GPtrDiff_t nDiffX =
     933   502537000 :                         static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
     934             : 
     935             :                     /* --------------------------------------------------------------------
     936             :                      */
     937             :                     /*      Copy over this pixel of data. */
     938             :                     /* --------------------------------------------------------------------
     939             :                      */
     940             : 
     941   502537000 :                     if (bByteCopy)
     942             :                     {
     943   442592000 :                         GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
     944   442592000 :                         static_cast<GByte *>(pData)[iBufOffset] =
     945   442592000 :                             pabySrcBlock[iSrcOffset];
     946             :                     }
     947    59944700 :                     else if (eDataType == eBufType)
     948             :                     {
     949    50322800 :                         GPtrDiff_t iSrcOffset =
     950    50322800 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     951    50322800 :                         memcpy(static_cast<GByte *>(pData) + iBufOffset,
     952    50322800 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
     953             :                     }
     954             :                     else
     955             :                     {
     956             :                         // Type to type conversion ...
     957     9621890 :                         GPtrDiff_t iSrcOffset =
     958     9621890 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     959     9621890 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
     960             :                                         static_cast<GByte *>(pData) +
     961     9621890 :                                             iBufOffset,
     962             :                                         eBufType, 0, 1);
     963             :                     }
     964             : 
     965   502537000 :                     iBufOffset += static_cast<int>(nPixelSpace);
     966             :                 }
     967             :             }
     968     1970040 :             if (eErr == CE_Failure)
     969          11 :                 break;
     970             : 
     971     2191390 :             if (psExtraArg->pfnProgress != nullptr &&
     972      221364 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     973             :                                          psExtraArg->pProgressData))
     974             :             {
     975           1 :                 eErr = CE_Failure;
     976           1 :                 break;
     977             :             }
     978             :         }
     979             :     }
     980             : 
     981      563648 :     if (poBlock != nullptr)
     982      563638 :         poBlock->DropLock();
     983             : 
     984      563648 :     return eErr;
     985             : }
     986             : 
     987             : /************************************************************************/
     988             : /*                      GDALRasterIOTransformer()                       */
     989             : /************************************************************************/
     990             : 
     991             : struct GDALRasterIOTransformerStruct  // Parameters of the per-axis linear mapping applied by GDALRasterIOTransformer().
     992             : {
     993             :     double dfXOff;  // X offset: added after scaling (dst -> src), subtracted before dividing (src -> dst).
     994             :     double dfYOff;  // Y offset, used the same way as dfXOff.
     995             :     double dfXRatioDstToSrc;  // X scale: multiplies x for dst -> src, divides for src -> dst.
     996             :     double dfYRatioDstToSrc;  // Y scale: multiplies y for dst -> src, divides for src -> dst.
     997             : };
     998             : 
     999        6897 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,  // Maps points in place between destination and source coordinates with a per-axis scale and offset.
    1000             :                                    int nPointCount, double *x, double *y,  // x and y each hold nPointCount values and are overwritten with the mapped coordinates.
    1001             :                                    double * /* z */, int *panSuccess)  // z is ignored; panSuccess[i] is set to TRUE for every point. Always returns TRUE.
    1002             : {
    1003        6897 :     GDALRasterIOTransformerStruct *psParams =  // pTransformerArg is cast to the parameter struct; it is not checked for null.
    1004             :         static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
    1005        6897 :     if (bDstToSrc)  // Non-zero: destination -> source direction.
    1006             :     {
    1007      311993 :         for (int i = 0; i < nPointCount; i++)
    1008             :         {
    1009      305684 :             x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;  // src = dst * ratio + offset
    1010      305684 :             y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
    1011      305684 :             panSuccess[i] = TRUE;
    1012             :         }
    1013             :     }
    1014             :     else  // Zero: source -> destination, the exact inverse of the branch above.
    1015             :     {
    1016        1176 :         for (int i = 0; i < nPointCount; i++)
    1017             :         {
    1018         588 :             x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;  // dst = (src - offset) / ratio; a zero ratio is not guarded against here.
    1019         588 :             y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
    1020         588 :             panSuccess[i] = TRUE;
    1021             :         }
    1022             :     }
    1023        6897 :     return TRUE;  // Unconditional: no failure path exists in this function.
    1024             : }
    1025             : 
    1026             : /************************************************************************/
    1027             : /*                         RasterIOResampled()                          */
    1028             : /************************************************************************/
    1029             : 
    1030             : //! @cond Doxygen_Suppress
    1031       14207 : CPLErr GDALRasterBand::RasterIOResampled(
    1032             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1033             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1034             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    1035             : {
    1036             :     // Determine if we use warping resampling or overview resampling
    1037             :     const bool bUseWarp =
    1038       14207 :         (GDALDataTypeIsComplex(eDataType) &&
    1039       14366 :          psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    1040         159 :          psExtraArg->eResampleAlg != GRIORA_Mode);
    1041             : 
    1042       14207 :     double dfXOff = nXOff;
    1043       14207 :     double dfYOff = nYOff;
    1044       14207 :     double dfXSize = nXSize;
    1045       14207 :     double dfYSize = nYSize;
    1046       14207 :     if (psExtraArg->bFloatingPointWindowValidity)
    1047             :     {
    1048       13512 :         dfXOff = psExtraArg->dfXOff;
    1049       13512 :         dfYOff = psExtraArg->dfYOff;
    1050       13512 :         dfXSize = psExtraArg->dfXSize;
    1051       13512 :         dfYSize = psExtraArg->dfYSize;
    1052             :     }
    1053             : 
    1054       14207 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1055       14207 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1056             : 
    1057             :     // Determine the coordinates in the "virtual" output raster to see
    1058             :     // if there are not integers, in which case we will use them as a shift
    1059             :     // so that subwindow extracts give the exact same results as entire raster
    1060             :     // scaling.
    1061       14207 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1062       14207 :     bool bHasXOffVirtual = false;
    1063       14207 :     int nDestXOffVirtual = 0;
    1064       14207 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1065             :     {
    1066       13879 :         bHasXOffVirtual = true;
    1067       13879 :         dfXOff = nXOff;
    1068       13879 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1069             :     }
    1070             : 
    1071       14207 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1072       14207 :     bool bHasYOffVirtual = false;
    1073       14207 :     int nDestYOffVirtual = 0;
    1074       14207 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1075             :     {
    1076       13875 :         bHasYOffVirtual = true;
    1077       13875 :         dfYOff = nYOff;
    1078       13875 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1079             :     }
    1080             : 
    1081             :     // Create a MEM dataset that wraps the output buffer.
    1082             :     GDALDataset *poMEMDS;
    1083       14207 :     void *pTempBuffer = nullptr;
    1084       14207 :     GSpacing nPSMem = nPixelSpace;
    1085       14207 :     GSpacing nLSMem = nLineSpace;
    1086       14207 :     void *pDataMem = pData;
    1087       14207 :     GDALDataType eDTMem = eBufType;
    1088       14207 :     if (eBufType != eDataType)
    1089             :     {
    1090          44 :         nPSMem = GDALGetDataTypeSizeBytes(eDataType);
    1091          44 :         nLSMem = nPSMem * nBufXSize;
    1092             :         pTempBuffer =
    1093          44 :             VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
    1094          44 :         if (pTempBuffer == nullptr)
    1095           0 :             return CE_Failure;
    1096          44 :         pDataMem = pTempBuffer;
    1097          44 :         eDTMem = eDataType;
    1098             :     }
    1099             : 
    1100             :     poMEMDS =
    1101       14207 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1102             :                            nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
    1103       14207 :     GByte *pabyData = static_cast<GByte *>(pDataMem) -
    1104       14207 :                       nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
    1105       14207 :     GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
    1106             :         poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
    1107       14207 :     poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
    1108             : 
    1109       14207 :     const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1110       14207 :     const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    1111       14207 :     if (pszNBITS)
    1112           6 :         GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
    1113           6 :             "NBITS", pszNBITS, "IMAGE_STRUCTURE");
    1114             : 
    1115       14207 :     CPLErr eErr = CE_None;
    1116             : 
    1117             :     // Do the resampling.
    1118       14207 :     if (bUseWarp)
    1119             :     {
    1120         149 :         int bHasNoData = FALSE;
    1121         149 :         double dfNoDataValue = GetNoDataValue(&bHasNoData);
    1122             : 
    1123         149 :         VRTDatasetH hVRTDS = nullptr;
    1124         149 :         GDALRasterBandH hVRTBand = nullptr;
    1125         149 :         if (GetDataset() == nullptr)
    1126             :         {
    1127             :             /* Create VRT dataset that wraps the whole dataset */
    1128           0 :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1129           0 :             VRTAddBand(hVRTDS, eDataType, nullptr);
    1130           0 :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1131           0 :             VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
    1132             :                                0, 0, nRasterXSize, nRasterYSize, nullptr,
    1133             :                                VRT_NODATA_UNSET);
    1134             : 
    1135             :             /* Add a mask band if needed */
    1136           0 :             if (GetMaskFlags() != GMF_ALL_VALID)
    1137             :             {
    1138           0 :                 GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
    1139             :                 VRTSourcedRasterBand *poVRTMaskBand =
    1140             :                     reinterpret_cast<VRTSourcedRasterBand *>(
    1141             :                         reinterpret_cast<GDALRasterBand *>(hVRTBand)
    1142           0 :                             ->GetMaskBand());
    1143           0 :                 poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
    1144           0 :                                                  nRasterYSize, 0, 0,
    1145           0 :                                                  nRasterXSize, nRasterYSize);
    1146             :             }
    1147             :         }
    1148             : 
    1149         149 :         GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
    1150         149 :         switch (psExtraArg->eResampleAlg)
    1151             :         {
    1152           0 :             case GRIORA_NearestNeighbour:
    1153           0 :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1154           0 :                 break;
    1155         147 :             case GRIORA_Bilinear:
    1156         147 :                 psWarpOptions->eResampleAlg = GRA_Bilinear;
    1157         147 :                 break;
    1158           0 :             case GRIORA_Cubic:
    1159           0 :                 psWarpOptions->eResampleAlg = GRA_Cubic;
    1160           0 :                 break;
    1161           0 :             case GRIORA_CubicSpline:
    1162           0 :                 psWarpOptions->eResampleAlg = GRA_CubicSpline;
    1163           0 :                 break;
    1164           0 :             case GRIORA_Lanczos:
    1165           0 :                 psWarpOptions->eResampleAlg = GRA_Lanczos;
    1166           0 :                 break;
    1167           0 :             case GRIORA_Average:
    1168           0 :                 psWarpOptions->eResampleAlg = GRA_Average;
    1169           0 :                 break;
    1170           2 :             case GRIORA_RMS:
    1171           2 :                 psWarpOptions->eResampleAlg = GRA_RMS;
    1172           2 :                 break;
    1173           0 :             case GRIORA_Mode:
    1174           0 :                 psWarpOptions->eResampleAlg = GRA_Mode;
    1175           0 :                 break;
    1176           0 :             default:
    1177           0 :                 CPLAssert(false);
    1178             :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1179             :                 break;
    1180             :         }
    1181         149 :         psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
    1182         149 :         psWarpOptions->hDstDS = poMEMDS;
    1183         149 :         psWarpOptions->nBandCount = 1;
    1184         149 :         int nSrcBandNumber = hVRTDS ? 1 : nBand;
    1185         149 :         int nDstBandNumber = 1;
    1186         149 :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1187         149 :         psWarpOptions->panDstBands = &nDstBandNumber;
    1188         298 :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress
    1189         149 :                                          ? psExtraArg->pfnProgress
    1190             :                                          : GDALDummyProgress;
    1191         149 :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1192         149 :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1193         149 :         if (bHasNoData)
    1194             :         {
    1195           0 :             psWarpOptions->papszWarpOptions = CSLSetNameValue(
    1196             :                 psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
    1197           0 :             if (psWarpOptions->padfSrcNoDataReal == nullptr)
    1198             :             {
    1199           0 :                 psWarpOptions->padfSrcNoDataReal =
    1200           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1201           0 :                 psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
    1202             :             }
    1203             : 
    1204           0 :             if (psWarpOptions->padfDstNoDataReal == nullptr)
    1205             :             {
    1206           0 :                 psWarpOptions->padfDstNoDataReal =
    1207           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1208           0 :                 psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
    1209             :             }
    1210             :         }
    1211             : 
    1212             :         GDALRasterIOTransformerStruct sTransformer;
    1213         149 :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1214         149 :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1215         149 :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1216         149 :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1217         149 :         psWarpOptions->pTransformerArg = &sTransformer;
    1218             : 
    1219             :         GDALWarpOperationH hWarpOperation =
    1220         149 :             GDALCreateWarpOperation(psWarpOptions);
    1221         149 :         eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
    1222             :                                      nDestYOffVirtual, nBufXSize, nBufYSize);
    1223         149 :         GDALDestroyWarpOperation(hWarpOperation);
    1224             : 
    1225         149 :         psWarpOptions->panSrcBands = nullptr;
    1226         149 :         psWarpOptions->panDstBands = nullptr;
    1227         149 :         GDALDestroyWarpOptions(psWarpOptions);
    1228             : 
    1229         149 :         if (hVRTDS)
    1230           0 :             GDALClose(hVRTDS);
    1231             :     }
    1232             :     else
    1233             :     {
    1234       14058 :         const char *pszResampling =
    1235       25844 :             (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
    1236       22879 :             : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
    1237       22148 :             : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
    1238       22069 :             : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
    1239       11137 :             : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
    1240         199 :             : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
    1241          79 :             : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
    1242           3 :             : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
    1243             :                                                                : "UNKNOWN";
    1244             : 
    1245       14058 :         int nKernelRadius = 0;
    1246             :         GDALResampleFunction pfnResampleFunc =
    1247       14058 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1248       14058 :         CPLAssert(pfnResampleFunc);
    1249             :         GDALDataType eWrkDataType =
    1250       14058 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1251       14058 :         int nHasNoData = 0;
    1252       14058 :         double dfNoDataValue = GetNoDataValue(&nHasNoData);
    1253       14058 :         const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    1254       14058 :         if (!bHasNoData)
    1255       13968 :             dfNoDataValue = 0.0;
    1256             : 
    1257       14058 :         int nDstBlockXSize = nBufXSize;
    1258       14058 :         int nDstBlockYSize = nBufYSize;
    1259       14058 :         int nFullResXChunk = 0;
    1260       14058 :         int nFullResYChunk = 0;
    1261             :         while (true)
    1262             :         {
    1263       14069 :             nFullResXChunk =
    1264       14069 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1265       14069 :             nFullResYChunk =
    1266       14069 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1267       14069 :             if (nFullResXChunk > nRasterXSize)
    1268        4726 :                 nFullResXChunk = nRasterXSize;
    1269       14069 :             if (nFullResYChunk > nRasterYSize)
    1270         543 :                 nFullResYChunk = nRasterYSize;
    1271       14069 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1272       14011 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1273             :                  1024 * 1024))
    1274             :                 break;
    1275             :             // When operating on the full width of a raster whose block width is
    1276             :             // the raster width, prefer doing chunks in height.
    1277          11 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1278             :                 nDstBlockYSize > 1)
    1279           0 :                 nDstBlockYSize /= 2;
    1280             :             /* Otherwise cut the maximal dimension */
    1281          11 :             else if (nDstBlockXSize > 1 &&
    1282           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1283          11 :                 nDstBlockXSize /= 2;
    1284             :             else
    1285           0 :                 nDstBlockYSize /= 2;
    1286             :         }
    1287             : 
    1288       14058 :         int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
    1289       14058 :         int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    1290       14058 :         if (nOvrXFactor == 0)
    1291        2029 :             nOvrXFactor = 1;
    1292       14058 :         if (nOvrYFactor == 0)
    1293        2028 :             nOvrYFactor = 1;
    1294       14058 :         int nFullResXSizeQueried =
    1295       14058 :             nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
    1296       14058 :         int nFullResYSizeQueried =
    1297       14058 :             nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
    1298             : 
    1299       14058 :         if (nFullResXSizeQueried > nRasterXSize)
    1300        2701 :             nFullResXSizeQueried = nRasterXSize;
    1301       14058 :         if (nFullResYSizeQueried > nRasterYSize)
    1302         299 :             nFullResYSizeQueried = nRasterYSize;
    1303             : 
    1304             :         void *pChunk =
    1305       14058 :             VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    1306             :                                 nFullResXSizeQueried, nFullResYSizeQueried);
    1307       14058 :         GByte *pabyChunkNoDataMask = nullptr;
    1308             : 
    1309       14058 :         GDALRasterBand *poMaskBand = GetMaskBand();
    1310       14058 :         int l_nMaskFlags = GetMaskFlags();
    1311             : 
    1312       14058 :         bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
    1313       14058 :         if (bUseNoDataMask)
    1314             :         {
    1315        7483 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1316             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1317             :         }
    1318       14058 :         if (pChunk == nullptr ||
    1319        7483 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1320             :         {
    1321           0 :             GDALClose(poMEMDS);
    1322           0 :             CPLFree(pChunk);
    1323           0 :             CPLFree(pabyChunkNoDataMask);
    1324           0 :             VSIFree(pTempBuffer);
    1325           0 :             return CE_Failure;
    1326             :         }
    1327             : 
    1328       14058 :         const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
    1329       14058 :                                  DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
    1330       14058 :         int nBlocksDone = 0;
    1331             : 
    1332             :         int nDstYOff;
    1333       28116 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1334       14058 :              nDstYOff += nDstBlockYSize)
    1335             :         {
    1336             :             int nDstYCount;
    1337       14058 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1338       14058 :                 nDstYCount = nDstBlockYSize;
    1339             :             else
    1340           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1341             : 
    1342       14058 :             int nChunkYOff =
    1343       14058 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1344       14058 :             int nChunkYOff2 = nYOff + 1 +
    1345       14058 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1346             :                                                     dfYRatioDstToSrc));
    1347       14058 :             if (nChunkYOff2 > nRasterYSize)
    1348         731 :                 nChunkYOff2 = nRasterYSize;
    1349       14058 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1350       14058 :             CPLAssert(nYCount <= nFullResYChunk);
    1351             : 
    1352       14058 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
    1353       14058 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
    1354       14058 :             if (nChunkYOffQueried < 0)
    1355             :             {
    1356         458 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1357         458 :                 nChunkYOffQueried = 0;
    1358             :             }
    1359       14058 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1360         561 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1361       14058 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1362             : 
    1363       14058 :             int nDstXOff = 0;
    1364       28116 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1365       14058 :                  nDstXOff += nDstBlockXSize)
    1366             :             {
    1367       14058 :                 int nDstXCount = 0;
    1368       14058 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1369       14058 :                     nDstXCount = nDstBlockXSize;
    1370             :                 else
    1371           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1372             : 
    1373       14058 :                 int nChunkXOff =
    1374       14058 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1375       14058 :                 int nChunkXOff2 =
    1376       14058 :                     nXOff + 1 +
    1377       14058 :                     static_cast<int>(
    1378       14058 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1379       14058 :                 if (nChunkXOff2 > nRasterXSize)
    1380        8751 :                     nChunkXOff2 = nRasterXSize;
    1381       14058 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1382       14058 :                 CPLAssert(nXCount <= nFullResXChunk);
    1383             : 
    1384       14058 :                 int nChunkXOffQueried =
    1385       14058 :                     nChunkXOff - nKernelRadius * nOvrXFactor;
    1386       14058 :                 int nChunkXSizeQueried =
    1387       14058 :                     nXCount + 2 * nKernelRadius * nOvrXFactor;
    1388       14058 :                 if (nChunkXOffQueried < 0)
    1389             :                 {
    1390        2762 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1391        2762 :                     nChunkXOffQueried = 0;
    1392             :                 }
    1393       14058 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1394        2748 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1395       14058 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1396             : 
    1397             :                 // Read the source buffers.
    1398       14058 :                 eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1399             :                                 nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1400             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    1401             :                                 eWrkDataType, 0, 0, nullptr);
    1402             : 
    1403       14058 :                 bool bSkipResample = false;
    1404       14058 :                 bool bNoDataMaskFullyOpaque = false;
    1405       14058 :                 if (eErr == CE_None && bUseNoDataMask)
    1406             :                 {
    1407        7483 :                     eErr = poMaskBand->RasterIO(
    1408             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1409             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1410             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1411             :                         nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
    1412             : 
    1413             :                     /* Optimizations if mask if fully opaque or transparent */
    1414        7483 :                     int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1415        7483 :                     GByte bVal = pabyChunkNoDataMask[0];
    1416        7483 :                     int i = 1;
    1417    15232100 :                     for (; i < nPixels; i++)
    1418             :                     {
    1419    15225700 :                         if (pabyChunkNoDataMask[i] != bVal)
    1420        1126 :                             break;
    1421             :                     }
    1422        7483 :                     if (i == nPixels)
    1423             :                     {
    1424        6357 :                         if (bVal == 0)
    1425             :                         {
    1426       12094 :                             for (int j = 0; j < nDstYCount; j++)
    1427             :                             {
    1428        6377 :                                 GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
    1429             :                                                 static_cast<GByte *>(pDataMem) +
    1430        6377 :                                                     nLSMem * (j + nDstYOff) +
    1431        6377 :                                                     nDstXOff * nPSMem,
    1432             :                                                 eDTMem,
    1433             :                                                 static_cast<int>(nPSMem),
    1434             :                                                 nDstXCount);
    1435             :                             }
    1436        5717 :                             bSkipResample = true;
    1437             :                         }
    1438             :                         else
    1439             :                         {
    1440         640 :                             bNoDataMaskFullyOpaque = true;
    1441             :                         }
    1442             :                     }
    1443             :                 }
    1444             : 
    1445       14058 :                 if (!bSkipResample && eErr == CE_None)
    1446             :                 {
    1447        8338 :                     const bool bPropagateNoData = false;
    1448        8338 :                     void *pDstBuffer = nullptr;
    1449        8338 :                     GDALDataType eDstBufferDataType = GDT_Unknown;
    1450             :                     GDALRasterBand *poMEMBand =
    1451        8338 :                         GDALRasterBand::FromHandle(hMEMBand);
    1452        8338 :                     GDALOverviewResampleArgs args;
    1453        8338 :                     args.eSrcDataType = eDataType;
    1454        8338 :                     args.eOvrDataType = poMEMBand->GetRasterDataType();
    1455        8338 :                     args.nOvrXSize = poMEMBand->GetXSize();
    1456        8338 :                     args.nOvrYSize = poMEMBand->GetYSize();
    1457        8338 :                     args.nOvrNBITS = nNBITS;
    1458        8338 :                     args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1459        8338 :                     args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1460        8338 :                     args.dfSrcXDelta =
    1461        8338 :                         dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1462        8338 :                     args.dfSrcYDelta =
    1463        8338 :                         dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1464        8338 :                     args.eWrkDataType = eWrkDataType;
    1465        8338 :                     args.pabyChunkNodataMask =
    1466        8338 :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
    1467        8338 :                     args.nChunkXOff =
    1468        8338 :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1469        8338 :                     args.nChunkXSize = nChunkXSizeQueried;
    1470        8338 :                     args.nChunkYOff =
    1471        8338 :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1472        8338 :                     args.nChunkYSize = nChunkYSizeQueried;
    1473        8338 :                     args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1474        8338 :                     args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
    1475        8338 :                     args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1476        8338 :                     args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
    1477        8338 :                     args.pszResampling = pszResampling;
    1478        8338 :                     args.bHasNoData = bHasNoData;
    1479        8338 :                     args.dfNoDataValue = dfNoDataValue;
    1480        8338 :                     args.poColorTable = GetColorTable();
    1481        8338 :                     args.bPropagateNoData = bPropagateNoData;
    1482        8338 :                     eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
    1483             :                                            &eDstBufferDataType);
    1484        8338 :                     if (eErr == CE_None)
    1485             :                     {
    1486        8338 :                         eErr = poMEMBand->RasterIO(
    1487             :                             GF_Write, nDstXOff + nDestXOffVirtual,
    1488             :                             nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
    1489             :                             pDstBuffer, nDstXCount, nDstYCount,
    1490             :                             eDstBufferDataType, 0, 0, nullptr);
    1491             :                     }
    1492        8338 :                     CPLFree(pDstBuffer);
    1493             :                 }
    1494             : 
    1495       14058 :                 nBlocksDone++;
    1496       24980 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1497       10922 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    1498             :                                              "", psExtraArg->pProgressData))
    1499             :                 {
    1500           1 :                     eErr = CE_Failure;
    1501             :                 }
    1502             :             }
    1503             :         }
    1504             : 
    1505       14058 :         CPLFree(pChunk);
    1506       14058 :         CPLFree(pabyChunkNoDataMask);
    1507             :     }
    1508             : 
    1509       14207 :     if (eBufType != eDataType)
    1510             :     {
    1511          44 :         CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
    1512             :             GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
    1513             :             pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    1514             :             nullptr));
    1515             :     }
    1516       14207 :     GDALClose(poMEMDS);
    1517       14207 :     VSIFree(pTempBuffer);
    1518             : 
    1519       14207 :     return eErr;
    1520             : }
    1521             : 
    1522             : /************************************************************************/
    1523             : /*                         RasterIOResampled()                          */
    1524             : /************************************************************************/
    1525             : 
    1526         886 : CPLErr GDALDataset::RasterIOResampled(
    1527             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1528             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1529             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    1530             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    1531             : 
    1532             : {
    1533             : #if 0
    1534             :     // Determine if we use warping resampling or overview resampling
    1535             :     bool bUseWarp = false;
    1536             :     if( GDALDataTypeIsComplex( eDataType ) )
    1537             :         bUseWarp = true;
    1538             : #endif
    1539             : 
    1540         886 :     double dfXOff = nXOff;
    1541         886 :     double dfYOff = nYOff;
    1542         886 :     double dfXSize = nXSize;
    1543         886 :     double dfYSize = nYSize;
    1544         886 :     if (psExtraArg->bFloatingPointWindowValidity)
    1545             :     {
    1546         765 :         dfXOff = psExtraArg->dfXOff;
    1547         765 :         dfYOff = psExtraArg->dfYOff;
    1548         765 :         dfXSize = psExtraArg->dfXSize;
    1549         765 :         dfYSize = psExtraArg->dfYSize;
    1550             :     }
    1551             : 
    1552         886 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1553         886 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1554             : 
    1555             :     // Determine the coordinates in the "virtual" output raster to see
    1556             :     // if they are integers, in which case we will use them as a shift
    1557             :     // so that subwindow extracts give the exact same results as entire raster
    1558             :     // scaling.
    1559         886 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1560         886 :     bool bHasXOffVirtual = false;
    1561         886 :     int nDestXOffVirtual = 0;
    1562         886 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1563             :     {
    1564         761 :         bHasXOffVirtual = true;
    1565         761 :         dfXOff = nXOff;
    1566         761 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1567             :     }
    1568             : 
    1569         886 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1570         886 :     bool bHasYOffVirtual = false;
    1571         886 :     int nDestYOffVirtual = 0;
    1572         886 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1573             :     {
    1574         721 :         bHasYOffVirtual = true;
    1575         721 :         dfYOff = nYOff;
    1576         721 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1577             :     }
    1578             : 
    1579             :     // Create a MEM dataset that wraps the output buffer.
    1580             :     GDALDataset *poMEMDS =
    1581         886 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1582             :                            nDestYOffVirtual + nBufYSize, 0, eBufType, nullptr);
    1583             :     GDALRasterBand **papoDstBands = static_cast<GDALRasterBand **>(
    1584         886 :         CPLMalloc(nBandCount * sizeof(GDALRasterBand *)));
    1585         886 :     int nNBITS = 0;
    1586        2878 :     for (int i = 0; i < nBandCount; i++)
    1587             :     {
    1588        1992 :         char szBuffer[32] = {'\0'};
    1589        3984 :         int nRet = CPLPrintPointer(
    1590             :             szBuffer,
    1591        1992 :             static_cast<GByte *>(pData) - nPixelSpace * nDestXOffVirtual -
    1592        1992 :                 nLineSpace * nDestYOffVirtual + nBandSpace * i,
    1593             :             sizeof(szBuffer));
    1594        1992 :         szBuffer[nRet] = 0;
    1595             : 
    1596        1992 :         char szBuffer0[64] = {'\0'};
    1597        1992 :         snprintf(szBuffer0, sizeof(szBuffer0), "DATAPOINTER=%s", szBuffer);
    1598             : 
    1599        1992 :         char szBuffer1[64] = {'\0'};
    1600        1992 :         snprintf(szBuffer1, sizeof(szBuffer1), "PIXELOFFSET=" CPL_FRMT_GIB,
    1601             :                  static_cast<GIntBig>(nPixelSpace));
    1602             : 
    1603        1992 :         char szBuffer2[64] = {'\0'};
    1604        1992 :         snprintf(szBuffer2, sizeof(szBuffer2), "LINEOFFSET=" CPL_FRMT_GIB,
    1605             :                  static_cast<GIntBig>(nLineSpace));
    1606             : 
    1607        1992 :         char *apszOptions[4] = {szBuffer0, szBuffer1, szBuffer2, nullptr};
    1608             : 
    1609        1992 :         poMEMDS->AddBand(eBufType, apszOptions);
    1610             : 
    1611        1992 :         GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
    1612        1992 :         papoDstBands[i] = poMEMDS->GetRasterBand(i + 1);
    1613             :         const char *pszNBITS =
    1614        1992 :             poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1615        1992 :         if (pszNBITS)
    1616             :         {
    1617           0 :             nNBITS = atoi(pszNBITS);
    1618           0 :             poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
    1619           0 :                                                            "IMAGE_STRUCTURE");
    1620             :         }
    1621             :     }
    1622             : 
    1623         886 :     CPLErr eErr = CE_None;
    1624             : 
    1625             :     // TODO(schwehr): Why disabled?  Why not just delete?
    1626             :     // Looks like this code was initially added as disabled by copying
    1627             :     // from RasterIO here:
    1628             :     // https://trac.osgeo.org/gdal/changeset/29572
    1629             : #if 0
    1630             :     // Do the resampling.
    1631             :     if( bUseWarp )
    1632             :     {
    1633             :         VRTDatasetH hVRTDS = nullptr;
    1634             :         GDALRasterBandH hVRTBand = nullptr;
    1635             :         if( GetDataset() == nullptr )
    1636             :         {
    1637             :             /* Create VRT dataset that wraps the whole dataset */
    1638             :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1639             :             VRTAddBand( hVRTDS, eDataType, nullptr );
    1640             :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1641             :             VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
    1642             :                                 (GDALRasterBandH)this,
    1643             :                                 0, 0,
    1644             :                                 nRasterXSize, nRasterYSize,
    1645             :                                 0, 0,
    1646             :                                 nRasterXSize, nRasterYSize,
    1647             :                                 nullptr, VRT_NODATA_UNSET );
    1648             : 
    1649             :             /* Add a mask band if needed */
    1650             :             if( GetMaskFlags() != GMF_ALL_VALID )
    1651             :             {
    1652             :                 ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
    1653             :                 VRTSourcedRasterBand* poVRTMaskBand =
    1654             :                     (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
    1655             :                 poVRTMaskBand->
    1656             :                     AddMaskBandSource( this,
    1657             :                                     0, 0,
    1658             :                                     nRasterXSize, nRasterYSize,
    1659             :                                     0, 0,
    1660             :                                     nRasterXSize, nRasterYSize);
    1661             :             }
    1662             :         }
    1663             : 
    1664             :         GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
    1665             :         psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
    1666             :         psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
    1667             :         psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
    1668             :         psWarpOptions->nBandCount = 1;
    1669             :         int nSrcBandNumber = (hVRTDS ? 1 : nBand);
    1670             :         int nDstBandNumber = 1;
    1671             :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1672             :         psWarpOptions->panDstBands = &nDstBandNumber;
    1673             :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
    1674             :                     psExtraArg->pfnProgress : GDALDummyProgress;
    1675             :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1676             :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1677             :         GDALRasterIOTransformerStruct sTransformer;
    1678             :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1679             :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1680             :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1681             :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1682             :         psWarpOptions->pTransformerArg = &sTransformer;
    1683             : 
    1684             :         GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
    1685             :         eErr = GDALChunkAndWarpImage( hWarpOperation,
    1686             :                                       nDestXOffVirtual, nDestYOffVirtual,
    1687             :                                       nBufXSize, nBufYSize );
    1688             :         GDALDestroyWarpOperation( hWarpOperation );
    1689             : 
    1690             :         psWarpOptions->panSrcBands = nullptr;
    1691             :         psWarpOptions->panDstBands = nullptr;
    1692             :         GDALDestroyWarpOptions( psWarpOptions );
    1693             : 
    1694             :         if( hVRTDS )
    1695             :             GDALClose(hVRTDS);
    1696             :     }
    1697             :     else
    1698             : #endif
    1699             :     {
    1700         886 :         const char *pszResampling =
    1701        1653 :             (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
    1702         767 :             : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
    1703           0 :             : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
    1704           0 :             : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
    1705           0 :             : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
    1706           0 :             : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
    1707           0 :             : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
    1708           0 :             : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
    1709             :                                                                : "UNKNOWN";
    1710             : 
    1711         886 :         GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
    1712         886 :         GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
    1713             :         int nBlockXSize, nBlockYSize;
    1714         886 :         poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    1715             : 
    1716             :         int nKernelRadius;
    1717             :         GDALResampleFunction pfnResampleFunc =
    1718         886 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1719         886 :         CPLAssert(pfnResampleFunc);
    1720             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1721             :         GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
    1722             :             GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
    1723             : #endif
    1724             :         GDALDataType eWrkDataType =
    1725         886 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1726             : 
    1727         886 :         int nDstBlockXSize = nBufXSize;
    1728         886 :         int nDstBlockYSize = nBufYSize;
    1729             :         int nFullResXChunk, nFullResYChunk;
    1730             :         while (true)
    1731             :         {
    1732         886 :             nFullResXChunk =
    1733         886 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1734         886 :             nFullResYChunk =
    1735         886 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1736         886 :             if (nFullResXChunk > nRasterXSize)
    1737         585 :                 nFullResXChunk = nRasterXSize;
    1738         886 :             if (nFullResYChunk > nRasterYSize)
    1739          51 :                 nFullResYChunk = nRasterYSize;
    1740         886 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1741         884 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1742             :                  1024 * 1024))
    1743             :                 break;
    1744             :             // When operating on the full width of a raster whose block width is
    1745             :             // the raster width, prefer doing chunks in height.
    1746           0 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1747             :                 nDstBlockYSize > 1)
    1748           0 :                 nDstBlockYSize /= 2;
    1749             :             /* Otherwise cut the maximal dimension */
    1750           0 :             else if (nDstBlockXSize > 1 &&
    1751           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1752           0 :                 nDstBlockXSize /= 2;
    1753             :             else
    1754           0 :                 nDstBlockYSize /= 2;
    1755             :         }
    1756             : 
    1757        1772 :         int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    1758         886 :                                   static_cast<int>(0.5 + dfYRatioDstToSrc));
    1759         886 :         if (nOvrFactor == 0)
    1760         104 :             nOvrFactor = 1;
    1761         886 :         int nFullResXSizeQueried =
    1762         886 :             nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    1763         886 :         int nFullResYSizeQueried =
    1764         886 :             nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
    1765             : 
    1766         886 :         if (nFullResXSizeQueried > nRasterXSize)
    1767         610 :             nFullResXSizeQueried = nRasterXSize;
    1768         886 :         if (nFullResYSizeQueried > nRasterYSize)
    1769          54 :             nFullResYSizeQueried = nRasterYSize;
    1770             : 
    1771         886 :         void *pChunk = VSI_MALLOC3_VERBOSE(
    1772             :             cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
    1773             :                               nBandCount),
    1774             :             nFullResXSizeQueried, nFullResYSizeQueried);
    1775         886 :         GByte *pabyChunkNoDataMask = nullptr;
    1776             : 
    1777         886 :         GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
    1778         886 :         int nMaskFlags = poFirstSrcBand->GetMaskFlags();
    1779             : 
    1780         886 :         bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
    1781         886 :         if (bUseNoDataMask)
    1782             :         {
    1783         617 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1784             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1785             :         }
    1786         886 :         if (pChunk == nullptr ||
    1787         617 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1788             :         {
    1789           0 :             GDALClose(poMEMDS);
    1790           0 :             CPLFree(pChunk);
    1791           0 :             CPLFree(pabyChunkNoDataMask);
    1792           0 :             CPLFree(papoDstBands);
    1793           0 :             return CE_Failure;
    1794             :         }
    1795             : 
    1796         886 :         const int nTotalBlocks = DIV_ROUND_UP(nBufXSize, nDstBlockXSize) *
    1797         886 :                                  DIV_ROUND_UP(nBufYSize, nDstBlockYSize);
    1798         886 :         int nBlocksDone = 0;
    1799             : 
    1800             :         int nDstYOff;
    1801        1772 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1802         886 :              nDstYOff += nDstBlockYSize)
    1803             :         {
    1804             :             int nDstYCount;
    1805         886 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1806         886 :                 nDstYCount = nDstBlockYSize;
    1807             :             else
    1808           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1809             : 
    1810         886 :             int nChunkYOff =
    1811         886 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1812         886 :             int nChunkYOff2 = nYOff + 1 +
    1813         886 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1814             :                                                     dfYRatioDstToSrc));
    1815         886 :             if (nChunkYOff2 > nRasterYSize)
    1816         133 :                 nChunkYOff2 = nRasterYSize;
    1817         886 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1818         886 :             CPLAssert(nYCount <= nFullResYChunk);
    1819             : 
    1820         886 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    1821         886 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
    1822         886 :             if (nChunkYOffQueried < 0)
    1823             :             {
    1824         136 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1825         136 :                 nChunkYOffQueried = 0;
    1826             :             }
    1827         886 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1828         151 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1829         886 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1830             : 
    1831             :             int nDstXOff;
    1832        1772 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1833         886 :                  nDstXOff += nDstBlockXSize)
    1834             :             {
    1835             :                 int nDstXCount;
    1836         886 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1837         886 :                     nDstXCount = nDstBlockXSize;
    1838             :                 else
    1839           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1840             : 
    1841         886 :                 int nChunkXOff =
    1842         886 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1843         886 :                 int nChunkXOff2 =
    1844         886 :                     nXOff + 1 +
    1845         886 :                     static_cast<int>(
    1846         886 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1847         886 :                 if (nChunkXOff2 > nRasterXSize)
    1848         641 :                     nChunkXOff2 = nRasterXSize;
    1849         886 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1850         886 :                 CPLAssert(nXCount <= nFullResXChunk);
    1851             : 
    1852         886 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    1853         886 :                 int nChunkXSizeQueried =
    1854         886 :                     nXCount + 2 * nKernelRadius * nOvrFactor;
    1855         886 :                 if (nChunkXOffQueried < 0)
    1856             :                 {
    1857         641 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1858         641 :                     nChunkXOffQueried = 0;
    1859             :                 }
    1860         886 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1861         649 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1862         886 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1863             : 
    1864         886 :                 bool bSkipResample = false;
    1865         886 :                 bool bNoDataMaskFullyOpaque = false;
    1866         886 :                 if (eErr == CE_None && bUseNoDataMask)
    1867             :                 {
    1868         617 :                     eErr = poMaskBand->RasterIO(
    1869             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1870             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1871             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1872             :                         nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
    1873             : 
    1874             :                     /* Optimizations if mask is fully opaque or transparent */
    1875         617 :                     const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1876         617 :                     const GByte bVal = pabyChunkNoDataMask[0];
    1877         617 :                     int i = 1;  // Used after for.
    1878    48197000 :                     for (; i < nPixels; i++)
    1879             :                     {
    1880    48196500 :                         if (pabyChunkNoDataMask[i] != bVal)
    1881          72 :                             break;
    1882             :                     }
    1883         617 :                     if (i == nPixels)
    1884             :                     {
    1885         545 :                         if (bVal == 0)
    1886             :                         {
    1887         373 :                             GByte abyZero[16] = {0};
    1888         780 :                             for (int iBand = 0; iBand < nBandCount; iBand++)
    1889             :                             {
    1890        3499 :                                 for (int j = 0; j < nDstYCount; j++)
    1891             :                                 {
    1892        3092 :                                     GDALCopyWords64(
    1893             :                                         abyZero, GDT_UInt8, 0,
    1894             :                                         static_cast<GByte *>(pData) +
    1895        3092 :                                             iBand * nBandSpace +
    1896        3092 :                                             nLineSpace * (j + nDstYOff) +
    1897        3092 :                                             nDstXOff * nPixelSpace,
    1898             :                                         eBufType, static_cast<int>(nPixelSpace),
    1899             :                                         nDstXCount);
    1900             :                                 }
    1901             :                             }
    1902         373 :                             bSkipResample = true;
    1903             :                         }
    1904             :                         else
    1905             :                         {
    1906         172 :                             bNoDataMaskFullyOpaque = true;
    1907             :                         }
    1908             :                     }
    1909             :                 }
    1910             : 
    1911         886 :                 if (!bSkipResample && eErr == CE_None)
    1912             :                 {
    1913             :                     /* Read the source buffers */
    1914         510 :                     eErr = RasterIO(
    1915             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1916             :                         nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1917             :                         nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
    1918             :                         nBandCount, panBandMap, 0, 0, 0, nullptr);
    1919             :                 }
    1920             : 
    1921             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1922             :                 if (pfnResampleFuncMultiBands && !bSkipResample &&
    1923             :                     eErr == CE_None)
    1924             :                 {
    1925             :                     eErr = pfnResampleFuncMultiBands(
    1926             :                         dfXRatioDstToSrc, dfYRatioDstToSrc,
    1927             :                         dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
    1928             :                         dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
    1929             :                         eWrkDataType, (GByte *)pChunk, nBandCount,
    1930             :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
    1931             :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
    1932             :                         nChunkXSizeQueried,
    1933             :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
    1934             :                         nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
    1935             :                         nDstXOff + nDestXOffVirtual + nDstXCount,
    1936             :                         nDstYOff + nDestYOffVirtual,
    1937             :                         nDstYOff + nDestYOffVirtual + nDstYCount, papoDstBands,
    1938             :                         pszResampling, FALSE /*bHasNoData*/,
    1939             :                         0.0 /* dfNoDataValue */, nullptr /* color table*/,
    1940             :                         eDataType);
    1941             :                 }
    1942             :                 else
    1943             : #endif
    1944             :                 {
    1945             :                     size_t nChunkBandOffset =
    1946         886 :                         static_cast<size_t>(nChunkXSizeQueried) *
    1947         886 :                         nChunkYSizeQueried *
    1948         886 :                         GDALGetDataTypeSizeBytes(eWrkDataType);
    1949        2462 :                     for (int i = 0;
    1950        2462 :                          i < nBandCount && !bSkipResample && eErr == CE_None;
    1951             :                          i++)
    1952             :                     {
    1953        1576 :                         const bool bPropagateNoData = false;
    1954        1576 :                         void *pDstBuffer = nullptr;
    1955        1576 :                         GDALDataType eDstBufferDataType = GDT_Unknown;
    1956             :                         GDALRasterBand *poMEMBand =
    1957        1576 :                             poMEMDS->GetRasterBand(i + 1);
    1958        1576 :                         GDALOverviewResampleArgs args;
    1959        1576 :                         args.eSrcDataType = eDataType;
    1960        1576 :                         args.eOvrDataType = poMEMBand->GetRasterDataType();
    1961        1576 :                         args.nOvrXSize = poMEMBand->GetXSize();
    1962        1576 :                         args.nOvrYSize = poMEMBand->GetYSize();
    1963        1576 :                         args.nOvrNBITS = nNBITS;
    1964        1576 :                         args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1965        1576 :                         args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1966        1576 :                         args.dfSrcXDelta =
    1967        1576 :                             dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1968        1576 :                         args.dfSrcYDelta =
    1969        1576 :                             dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1970        1576 :                         args.eWrkDataType = eWrkDataType;
    1971        1576 :                         args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
    1972        1576 :                                                        ? nullptr
    1973             :                                                        : pabyChunkNoDataMask;
    1974        1576 :                         args.nChunkXOff =
    1975        1576 :                             nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1976        1576 :                         args.nChunkXSize = nChunkXSizeQueried;
    1977        1576 :                         args.nChunkYOff =
    1978        1576 :                             nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1979        1576 :                         args.nChunkYSize = nChunkYSizeQueried;
    1980        1576 :                         args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1981        1576 :                         args.nDstXOff2 =
    1982        1576 :                             nDstXOff + nDestXOffVirtual + nDstXCount;
    1983        1576 :                         args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1984        1576 :                         args.nDstYOff2 =
    1985        1576 :                             nDstYOff + nDestYOffVirtual + nDstYCount;
    1986        1576 :                         args.pszResampling = pszResampling;
    1987        1576 :                         args.bHasNoData = false;
    1988        1576 :                         args.dfNoDataValue = 0.0;
    1989        1576 :                         args.poColorTable = nullptr;
    1990        1576 :                         args.bPropagateNoData = bPropagateNoData;
    1991             : 
    1992             :                         eErr =
    1993        3152 :                             pfnResampleFunc(args,
    1994        1576 :                                             reinterpret_cast<GByte *>(pChunk) +
    1995        1576 :                                                 i * nChunkBandOffset,
    1996             :                                             &pDstBuffer, &eDstBufferDataType);
    1997        1576 :                         if (eErr == CE_None)
    1998             :                         {
    1999        1576 :                             eErr = poMEMBand->RasterIO(
    2000             :                                 GF_Write, nDstXOff + nDestXOffVirtual,
    2001             :                                 nDstYOff + nDestYOffVirtual, nDstXCount,
    2002             :                                 nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
    2003             :                                 eDstBufferDataType, 0, 0, nullptr);
    2004             :                         }
    2005        1576 :                         CPLFree(pDstBuffer);
    2006             :                     }
    2007             :                 }
    2008             : 
    2009         886 :                 nBlocksDone++;
    2010        1275 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    2011         389 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    2012             :                                              "", psExtraArg->pProgressData))
    2013             :                 {
    2014           0 :                     eErr = CE_Failure;
    2015             :                 }
    2016             :             }
    2017             :         }
    2018             : 
    2019         886 :         CPLFree(pChunk);
    2020         886 :         CPLFree(pabyChunkNoDataMask);
    2021             :     }
    2022             : 
    2023         886 :     CPLFree(papoDstBands);
    2024         886 :     GDALClose(poMEMDS);
    2025             : 
    2026         886 :     return eErr;
    2027             : }
    2028             : 
    2029             : //! @endcond
    2030             : 
    2031             : /************************************************************************/
    2032             : /*                           GDALSwapWords()                            */
    2033             : /************************************************************************/
    2034             : 
    2035             : /**
    2036             :  * Byte swap words in-place.
    2037             :  *
    2038             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2039             :  * a memory array.  No assumption is made that the words being swapped are
    2040             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2041             :  * to determine if the current platform is big endian or little endian.  Use
    2042             :  * the macros like CPL_SWAP32() to byte swap single values without the overhead
    2043             :  * of a function call.
    2044             :  *
    2045             :  * @param pData pointer to start of data buffer; only validated when nWordCount > 0.
    2046             :  * @param nWordSize size of words in bytes: 2, 4 or 8 are swapped, 1 is a no-op.
    2047             :  * @param nWordCount the number of words to be swapped in this call.
    2048             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2049             :  * the next. For packed buffers this is the same as nWordSize.
    2050             :  */
    2051             : 
    2052      497143 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
    2053             :                                int nWordSkip)
    2054             : 
    2055             : {
    2056      497143 :     if (nWordCount > 0)
    2057      497143 :         VALIDATE_POINTER0(pData, "GDALSwapWords");
    2058             :     // Byte cursor over the buffer; each loop below advances it by nWordSkip.
    2059      497143 :     GByte *pabyData = static_cast<GByte *>(pData);
    2060             :     // Sizes other than 1, 2, 4 or 8 fall through to the CPLAssert(false) default.
    2061      497143 :     switch (nWordSize)
    2062             :     {
    2063        7234 :         case 1:
    2064        7234 :             break;
    2065             :         // 16-bit words: each word is swapped with CPL_SWAP16PTR on the byte pointer.
    2066      476903 :         case 2:
    2067      476903 :             CPLAssert(nWordSkip >= 2 || nWordCount == 1);
    2068   228062000 :             for (int i = 0; i < nWordCount; i++)
    2069             :             {
    2070   227585000 :                 CPL_SWAP16PTR(pabyData);
    2071   227585000 :                 pabyData += nWordSkip;
    2072             :             }
    2073      476903 :             break;
    2074             :         // 32-bit words: GUInt32 accesses if start and stride allow, else CPL_SWAP32PTR.
    2075       10580 :         case 4:
    2076       10580 :             CPLAssert(nWordSkip >= 4 || nWordCount == 1);
    2077       10580 :             if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
    2078             :             {
    2079    29140500 :                 for (int i = 0; i < nWordCount; i++)
    2080             :                 {
    2081    29130000 :                     *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
    2082             :                         *reinterpret_cast<const GUInt32 *>(pabyData));
    2083    29130000 :                     pabyData += nWordSkip;
    2084       10577 :                 }
    2085             :             }
    2086             :             else
    2087             :             {
    2088           9 :                 for (int i = 0; i < nWordCount; i++)
    2089             :                 {
    2090           6 :                     CPL_SWAP32PTR(pabyData);
    2091           6 :                     pabyData += nWordSkip;
    2092             :                 }
    2093             :             }
    2094       10580 :             break;
    2095             :         // 64-bit words: same split as for 4 bytes, with GUInt64 or CPL_SWAP64PTR.
    2096        2426 :         case 8:
    2097        2426 :             CPLAssert(nWordSkip >= 8 || nWordCount == 1);
    2098        2426 :             if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
    2099             :             {
    2100     3356900 :                 for (int i = 0; i < nWordCount; i++)
    2101             :                 {
    2102     3354480 :                     *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
    2103             :                         *reinterpret_cast<const GUInt64 *>(pabyData));
    2104     3354480 :                     pabyData += nWordSkip;
    2105        2425 :                 }
    2106             :             }
    2107             :             else
    2108             :             {
    2109           3 :                 for (int i = 0; i < nWordCount; i++)
    2110             :                 {
    2111           2 :                     CPL_SWAP64PTR(pabyData);
    2112           2 :                     pabyData += nWordSkip;
    2113             :                 }
    2114             :             }
    2115        2426 :             break;
    2116             :         // Unsupported word size.
    2117           0 :         default:
    2118           0 :             CPLAssert(false);
    2119             :     }
    2120             : }
    2121             : 
    2122             : /************************************************************************/
    2123             : /*                          GDALSwapWordsEx()                           */
    2124             : /************************************************************************/
    2125             : 
    2126             : /**
    2127             :  * Byte swap words in-place, with the word count expressed as a size_t.
    2128             :  *
    2129             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2130             :  * a memory array.  No assumption is made that the words being swapped are
    2131             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2132             :  * to determine if the current platform is big endian or little endian.  Use
    2133             :  * the macros like CPL_SWAP32() to byte swap single values without the overhead
    2134             :  * of a function call.
    2135             :  *
    2136             :  * @param pData pointer to start of data buffer.
    2137             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2138             :  * @param nWordCount the number of words to be swapped; passed on to GDALSwapWords() in chunks.
    2139             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2140             :  * the next. For packed buffers this is the same as nWordSize.
    2141             :  */
    2142        6124 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
    2143             :                                  int nWordSkip)
    2144             : {
    2145        6124 :     GByte *pabyData = static_cast<GByte *>(pData);
    2146       12248 :     while (nWordCount)
    2147             :     {
    2148             :         // Pick-up a multiple of 8 as max chunk size: at most 1 << 30 words per call, which fits in an int.
    2149        6124 :         const int nWordCountSmall =
    2150        6124 :             (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
    2151        6124 :         GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
    2152        6124 :         pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;  // skip the chunk just swapped
    2153        6124 :         nWordCount -= nWordCountSmall;
    2154             :     }
    2155        6124 : }
    2156             : 
    2157             : // Place the new GDALCopyWords helpers in an anonymous namespace
    2158             : namespace
    2159             : {
    2160             : 
    2161             : /************************************************************************/
    2162             : /*                           GDALCopyWordsT()                           */
    2163             : /************************************************************************/
    2164             : /**
    2165             :  * Template function, used to copy data from pSrcData into buffer
    2166             :  * pDstData, with stride nSrcPixelStride in the source data and
    2167             :  * stride nDstPixelStride in the destination data. This template can
    2168             :  * deal with the case where the input data type is real or complex and
    2169             :  * the output is real.
    2170             :  *
    2171             :  * @param pSrcData the source data buffer
    2172             :  * @param nSrcPixelStride the stride in bytes, in the buffer pSrcData, for
    2173             :  *                      pixels of interest.
    2174             :  * @param pDstData the destination buffer.
    2175             :  * @param nDstPixelStride the stride in bytes, in the buffer pDstData, for
    2176             :  *                      pixels of interest.
    2177             :  * @param nWordCount the total number of pixel words to copy
    2178             :  *
    2179             :  * @code
    2180             :  * // Assume an input buffer of type GUInt16 named pBufferIn
    2181             :  * GByte *pBufferOut = new GByte[numBytesOut];
    2182             :  * GDALCopyWordsT<GUInt16, GByte>(pBufferIn, 2, pBufferOut, 1, numBytesOut);
    2183             :  * @endcode
    2184             :  * @note
    2185             :  * This is a private function, and should not be exposed outside of
    2186             :  * rasterio.cpp. External users should call the GDALCopyWords driver function.
    2187             :  */
    2188             : 
    2189             : template <class Tin, class Tout>
    2190    49004022 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
    2191             :                                          int nSrcPixelStride,
    2192             :                                          Tout *const CPL_RESTRICT pDstData,
    2193             :                                          int nDstPixelStride,
    2194             :                                          GPtrDiff_t nWordCount)
    2195             : {
    2196    49004022 :     decltype(nWordCount) nDstOffset = 0;
    2197             :     // Strides are applied as byte offsets, hence the char* views of both buffers.
    2198    49004022 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2199    49004022 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2200   356635480 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2201             :     {
    2202   307631416 :         const Tin tValue =
    2203   307631416 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2204   307631416 :         Tout *const pOutPixel =
    2205   307631416 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2206             :         // Each word is transferred from Tin to Tout through GDALCopyWord().
    2207   307631416 :         GDALCopyWord(tValue, *pOutPixel);
    2208             : 
    2209   307631416 :         nDstOffset += nDstPixelStride;
    2210             :     }
    2211    49004022 : }
    2212             : /** Generic GDALCopyWordsT(), declared CPL_NOINLINE: forwards unchanged to GDALCopyWordsGenericT(). */
    2213             : template <class Tin, class Tout>
    2214    29766045 : static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
    2215             :                                         int nSrcPixelStride,
    2216             :                                         Tout *const CPL_RESTRICT pDstData,
    2217             :                                         int nDstPixelStride,
    2218             :                                         GPtrDiff_t nWordCount)
    2219             : {
    2220    29766045 :     GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
    2221             :                           nWordCount);
    2222    29766045 : }
    2223             : /** Strided word copy that calls GDALCopy8Words() per group of 8 words when both buffers are packed. */
    2224             : template <class Tin, class Tout>
    2225     5094306 : static void inline GDALCopyWordsT_8atatime(
    2226             :     const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
    2227             :     Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
    2228             :     GPtrDiff_t nWordCount)
    2229             : {
    2230     5094306 :     decltype(nWordCount) nDstOffset = 0;
    2231             :     // Strides are byte offsets, so both buffers are addressed through char* views.
    2232     5094306 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2233     5094306 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2234     5094306 :     decltype(nWordCount) n = 0;
    2235     5094306 :     if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
    2236             :         nDstPixelStride == static_cast<int>(sizeof(Tout)))
    2237             :     {
    2238    57871497 :         for (; n < nWordCount - 7; n += 8)
    2239             :         {
    2240    57326656 :             const Tin *pInValues = reinterpret_cast<const Tin *>(
    2241    57326656 :                 pSrcDataPtr + (n * nSrcPixelStride));
    2242    57326656 :             Tout *const pOutPixels =
    2243    57326656 :                 reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2244             :             // One GDALCopy8Words() call per group; n and nDstOffset then move on by 8 words.
    2245    57326656 :             GDALCopy8Words(pInValues, pOutPixels);
    2246             : 
    2247    57326656 :             nDstOffset += 8 * nDstPixelStride;
    2248             :         }
    2249             :     }
    2250    10491671 :     for (; n < nWordCount; n++)  // remaining words, or all of them if not packed
    2251             :     {
    2252     5397365 :         const Tin tValue =
    2253     5397365 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2254     5397365 :         Tout *const pOutPixel =
    2255     5397365 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2256             : 
    2257     5397365 :         GDALCopyWord(tValue, *pOutPixel);
    2258             : 
    2259     5397365 :         nDstOffset += nDstPixelStride;
    2260             :     }
    2261     5094306 : }
    2262             : 
    2263             : #ifdef HAVE_SSE2
    2264             : /** Copies GByte words into a 16-bit integer Tout; packed buffers use SSE2, 16 words per iteration. */
    2265             : template <class Tout>
    2266     1042120 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
    2267             :                               int nSrcPixelStride,
    2268             :                               Tout *const CPL_RESTRICT pDstData,
    2269             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2270             : {
    2271             :     static_assert(std::is_integral<Tout>::value &&
    2272             :                       sizeof(Tout) == sizeof(uint16_t),
    2273             :                   "Bad Tout");
    2274     1042120 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2275             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2276             :     {
    2277       35766 :         decltype(nWordCount) n = 0;
    2278       35766 :         const __m128i xmm_zero = _mm_setzero_si128();
    2279       35766 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2280             :             reinterpret_cast<GByte *>(pDstData);
    2281     1478162 :         for (; n < nWordCount - 15; n += 16)  // 16 source bytes per iteration
    2282             :         {
    2283     1442396 :             __m128i xmm = _mm_loadu_si128(
    2284     1442396 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2285     1442396 :             __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);  // bytes 0..7 interleaved with zeros
    2286     1442396 :             __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);  // bytes 8..15 interleaved with zeros
    2287             :             _mm_storeu_si128(
    2288     1442396 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
    2289             :             _mm_storeu_si128(
    2290     1442396 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
    2291             :         }
    2292      111789 :         for (; n < nWordCount; n++)  // scalar tail for the words left over
    2293             :         {
    2294       76023 :             pDstData[n] = pSrcData[n];
    2295       35766 :         }
    2296             :     }
    2297             :     else  // non-packed strides: generic per-word loop
    2298             :     {
    2299     1006351 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2300             :                               nDstPixelStride, nWordCount);
    2301             :     }
    2302     1042120 : }
    2303             : /** GByte -> GUInt16 specialization of GDALCopyWordsT(): forwards to GDALCopyWordsByteTo16Bit(). */
    2304             : template <>
    2305     1029380 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2306             :                                  int nSrcPixelStride,
    2307             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    2308             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2309             : {
    2310     1029380 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2311             :                              nDstPixelStride, nWordCount);
    2312     1029380 : }
    2313             : /** GByte -> GInt16 specialization of GDALCopyWordsT(): forwards to GDALCopyWordsByteTo16Bit(). */
    2314             : template <>
    2315       12740 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2316             :                                  int nSrcPixelStride,
    2317             :                                  GInt16 *const CPL_RESTRICT pDstData,
    2318             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2319             : {
    2320       12740 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2321             :                              nDstPixelStride, nWordCount);
    2322       12740 : }
    2323             : /** Copies GByte words into a 32-bit integer Tout; packed buffers use SSE2, 16 words per iteration. */
    2324             : template <class Tout>
    2325    16237076 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
    2326             :                               int nSrcPixelStride,
    2327             :                               Tout *const CPL_RESTRICT pDstData,
    2328             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2329             : {
    2330             :     static_assert(std::is_integral<Tout>::value &&
    2331             :                       sizeof(Tout) == sizeof(uint32_t),
    2332             :                   "Bad Tout");
    2333    16237076 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2334             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2335             :     {
    2336     6532606 :         decltype(nWordCount) n = 0;
    2337     6532606 :         const __m128i xmm_zero = _mm_setzero_si128();
    2338     6532606 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2339             :             reinterpret_cast<GByte *>(pDstData);
    2340    74248627 :         for (; n < nWordCount - 15; n += 16)  // 16 source bytes in, 64 destination bytes out
    2341             :         {
    2342    67715961 :             __m128i xmm = _mm_loadu_si128(
    2343    67715961 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2344    67715961 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);  // 8 -> 16 bit, bytes 0..7
    2345    67715961 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);  // 8 -> 16 bit, bytes 8..15
    2346    67715961 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);  // 16 -> 32 bit, words 0..3
    2347    67715961 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);  // words 4..7
    2348    67715961 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);  // words 8..11
    2349    67715961 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);  // words 12..15
    2350             :             _mm_storeu_si128(
    2351    67715961 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
    2352             :             _mm_storeu_si128(
    2353    67715961 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
    2354             :             _mm_storeu_si128(
    2355    67715961 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
    2356             :             _mm_storeu_si128(
    2357    67715961 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
    2358             :         }
    2359    14825316 :         for (; n < nWordCount; n++)  // scalar tail for the words left over
    2360             :         {
    2361     8292760 :             pDstData[n] = pSrcData[n];
    2362     6532606 :         }
    2363             :     }
    2364             :     else  // non-packed strides: generic per-word loop
    2365             :     {
    2366     9704490 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2367             :                               nDstPixelStride, nWordCount);
    2368             :     }
    2369    16237076 : }
    2370             : /** GByte -> GUInt32 specialization of GDALCopyWordsT(): forwards to GDALCopyWordsByteTo32Bit(). */
    2371             : template <>
    2372         476 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2373             :                                  int nSrcPixelStride,
    2374             :                                  GUInt32 *const CPL_RESTRICT pDstData,
    2375             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2376             : {
    2377         476 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2378             :                              nDstPixelStride, nWordCount);
    2379         476 : }
    2380             : /** GByte -> GInt32 specialization of GDALCopyWordsT(): forwards to GDALCopyWordsByteTo32Bit(). */
    2381             : template <>
    2382    16236600 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2383             :                                  int nSrcPixelStride,
    2384             :                                  GInt32 *const CPL_RESTRICT pDstData,
    2385             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2386             : {
    2387    16236600 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2388             :                              nDstPixelStride, nWordCount);
    2389    16236600 : }
    2390             : /** GByte -> float specialization of GDALCopyWordsT(); packed buffers use SSE2, 16 words per iteration. */
    2391             : template <>
    2392     2856070 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2393             :                                  int nSrcPixelStride,
    2394             :                                  float *const CPL_RESTRICT pDstData,
    2395             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2396             : {
    2397     2856070 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2398             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2399             :     {
    2400      233183 :         decltype(nWordCount) n = 0;
    2401      233183 :         const __m128i xmm_zero = _mm_setzero_si128();
    2402      233183 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2403             :             reinterpret_cast<GByte *>(pDstData);
    2404     4780370 :         for (; n < nWordCount - 15; n += 16)  // 16 source bytes in, 16 floats out
    2405             :         {
    2406     4547190 :             __m128i xmm = _mm_loadu_si128(
    2407     4547190 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2408     4547190 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);  // 8 -> 16 bit, bytes 0..7
    2409     4547190 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);  // 8 -> 16 bit, bytes 8..15
    2410     4547190 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);  // 16 -> 32 bit, words 0..3
    2411     4547190 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);  // words 4..7
    2412     4547190 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);  // words 8..11
    2413     4547190 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);  // words 12..15
    2414     4547190 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);  // 32-bit integer lanes -> float lanes
    2415     4547190 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2416     4547190 :             __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
    2417     4547190 :             __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
    2418     4547190 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2419             :                           xmm0_f);
    2420             :             _mm_storeu_ps(
    2421     4547190 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2422             :             _mm_storeu_ps(
    2423     4547190 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
    2424             :             _mm_storeu_ps(
    2425     4547190 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
    2426             :         }
    2427      957106 :         for (; n < nWordCount; n++)  // scalar tail for the words left over
    2428             :         {
    2429      723923 :             pDstData[n] = pSrcData[n];
    2430      233183 :         }
    2431             :     }
    2432             :     else  // non-packed strides: generic per-word loop
    2433             :     {
    2434     2622880 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2435             :                               nDstPixelStride, nWordCount);
    2436             :     }
    2437     2856070 : }
    2438             : 
    2439             : template <>
    2440      170733 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2441             :                                  int nSrcPixelStride,
    2442             :                                  double *const CPL_RESTRICT pDstData,
    2443             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2444             : {
    2445      170733 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2446             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2447             :     {
    2448      146935 :         decltype(nWordCount) n = 0;
    2449      146935 :         const __m128i xmm_zero = _mm_setzero_si128();
    2450      146935 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2451             :             reinterpret_cast<GByte *>(pDstData);
    2452     3126930 :         for (; n < nWordCount - 15; n += 16)
    2453             :         {
    2454     2979990 :             __m128i xmm = _mm_loadu_si128(
    2455     2979990 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2456     2979990 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2457     2979990 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2458     2979990 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2459     2979990 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2460     2979990 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2461     2979990 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2462             : 
    2463             : #if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
    2464             :             _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2465             :                              _mm256_cvtepi32_pd(xmm0));
    2466             :             _mm256_storeu_pd(
    2467             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2468             :                 _mm256_cvtepi32_pd(xmm1));
    2469             :             _mm256_storeu_pd(
    2470             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2471             :                 _mm256_cvtepi32_pd(xmm2));
    2472             :             _mm256_storeu_pd(
    2473             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2474             :                 _mm256_cvtepi32_pd(xmm3));
    2475             : #else
    2476     2979990 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2477     2979990 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2478     2979990 :             __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
    2479     2979990 :             __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
    2480     2979990 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2481     2979990 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2482     2979990 :             xmm2 = _mm_srli_si128(xmm2, 8);
    2483     2979990 :             xmm3 = _mm_srli_si128(xmm3, 8);
    2484     2979990 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2485     2979990 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2486     2979990 :             __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
    2487     2979990 :             __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
    2488             : 
    2489     2979990 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2490             :                           xmm0_low_d);
    2491             :             _mm_storeu_pd(
    2492     2979990 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2493             :                 xmm0_high_d);
    2494             :             _mm_storeu_pd(
    2495     2979990 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2496             :                 xmm1_low_d);
    2497             :             _mm_storeu_pd(
    2498     2979990 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2499             :                 xmm1_high_d);
    2500             :             _mm_storeu_pd(
    2501     2979990 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2502             :                 xmm2_low_d);
    2503             :             _mm_storeu_pd(
    2504     2979990 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
    2505             :                 xmm2_high_d);
    2506             :             _mm_storeu_pd(
    2507     2979990 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2508             :                 xmm3_low_d);
    2509             :             _mm_storeu_pd(
    2510     2979990 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
    2511             :                 xmm3_high_d);
    2512             : #endif
    2513             :         }
    2514      280278 :         for (; n < nWordCount; n++)
    2515             :         {
    2516      133343 :             pDstData[n] = pSrcData[n];
    2517      146935 :         }
    2518             :     }
    2519             :     else
    2520             :     {
    2521       23798 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2522             :                               nDstPixelStride, nWordCount);
    2523             :     }
    2524      170733 : }
    2525             : 
    2526             : template <>
                       // GDALCopyWordsT specialization: uint8_t source, int8_t destination.
                       // Source values of 127 or more are written as 127, smaller values are
                       // copied unchanged. When both strides equal the element size, 32 values
                       // are processed per iteration with 128-bit intrinsics and a scalar loop
                       // finishes the remainder; any other stride combination is forwarded to
                       // GDALCopyWordsGenericT().
    2527         148 : CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
    2528             :                                  int nSrcPixelStride,
    2529             :                                  int8_t *const CPL_RESTRICT pDstData,
    2530             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2531             : {
    2532         148 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2533             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2534             :     {
    2535         142 :         decltype(nWordCount) n = 0;
    2536         142 :         const __m128i xmm_127 = _mm_set1_epi8(127);
                               // Bound "nWordCount - 31" guarantees n + 31 is a valid index,
                               // i.e. two full 16-byte loads/stores fit.
    2537         146 :         for (; n < nWordCount - 31; n += 32)
    2538             :         {
    2539           8 :             __m128i xmm0 = _mm_loadu_si128(
    2540           4 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2541           4 :             __m128i xmm1 = _mm_loadu_si128(
    2542           4 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 16));
                                   // Unsigned per-byte minimum with 127: every result fits
                                   // in the non-negative range of int8_t.
    2543           4 :             xmm0 = _mm_min_epu8(xmm0, xmm_127);
    2544           4 :             xmm1 = _mm_min_epu8(xmm1, xmm_127);
    2545           4 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2546           4 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
    2547             :                              xmm1);
    2548             :         }
                               // Scalar tail: same clamp for the remaining (< 32) values.
    2549        2424 :         for (; n < nWordCount; n++)
    2550             :         {
    2551        2282 :             pDstData[n] =
    2552        2282 :                 pSrcData[n] >= 127 ? 127 : static_cast<int8_t>(pSrcData[n]);
    2553         142 :         }
    2554             :     }
    2555             :     else
    2556             :     {
                               // Non-contiguous source and/or destination.
    2557           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2558             :                               nDstPixelStride, nWordCount);
    2559             :     }
    2560         148 : }
    2561             : 
    2562             : template <>
                       // GDALCopyWordsT specialization: int8_t source, uint8_t destination.
                       // Negative source values are written as 0, others are copied unchanged.
                       // When both strides equal the element size, 32 values are processed per
                       // iteration with 128-bit intrinsics and a scalar loop finishes the
                       // remainder; any other stride combination is forwarded to
                       // GDALCopyWordsGenericT().
    2563          82 : CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
    2564             :                                  int nSrcPixelStride,
    2565             :                                  uint8_t *const CPL_RESTRICT pDstData,
    2566             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2567             : {
    2568          82 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2569             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2570             :     {
    2571          56 :         decltype(nWordCount) n = 0;
    2572             : #if !(defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS))
                               // Byte pattern 0x80 in every lane; only needed by the fallback
                               // branch below that has no signed 8-bit max.
    2573          56 :         const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
    2574             : #endif
                               // Bound "nWordCount - 31" guarantees two full 16-byte
                               // loads/stores fit.
    2575         117 :         for (; n < nWordCount - 31; n += 32)
    2576             :         {
    2577         122 :             __m128i xmm0 = _mm_loadu_si128(
    2578          61 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2579          61 :             __m128i xmm1 = _mm_loadu_si128(
    2580          61 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 16));
    2581             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
                                   // Signed per-byte max with 0 turns every negative lane into 0.
    2582             :             xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
    2583             :             xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
    2584             : #else
                                   // Same clamp using only the unsigned max: adding 0x80 (with
                                   // wrap-around) maps signed order onto unsigned order, the
                                   // unsigned max with 0x80 (the image of 0) raises negative
                                   // lanes to it, and subtracting 0x80 maps back.
    2585          61 :             xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
    2586          61 :             xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
    2587          61 :             xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
    2588          61 :             xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
    2589          61 :             xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
    2590          61 :             xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
    2591             : #endif
    2592          61 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2593          61 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
    2594             :                              xmm1);
    2595             :         }
                               // Scalar tail: same clamp for the remaining (< 32) values.
    2596         352 :         for (; n < nWordCount; n++)
    2597             :         {
    2598         296 :             pDstData[n] =
    2599         296 :                 pSrcData[n] < 0 ? 0 : static_cast<uint8_t>(pSrcData[n]);
    2600          56 :         }
    2601             :     }
    2602             :     else
    2603             :     {
                               // Non-contiguous source and/or destination.
    2604          26 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2605             :                               nDstPixelStride, nWordCount);
    2606             :     }
    2607          82 : }
    2608             : 
    2609             : template <>
                       // GDALCopyWordsT specialization: uint16_t source, uint8_t destination.
                       // Source values of 255 or more are written as 255, smaller values are
                       // copied unchanged. When both strides equal the element size, 16 values
                       // are processed per iteration with 128-bit intrinsics and a scalar loop
                       // finishes the remainder; any other stride combination is forwarded to
                       // GDALCopyWordsGenericT().
    2610        6037 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2611             :                                  int nSrcPixelStride,
    2612             :                                  uint8_t *const CPL_RESTRICT pDstData,
    2613             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2614             : {
    2615        6037 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2616             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2617             :     {
    2618        5062 :         decltype(nWordCount) n = 0;
    2619             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2620             :         const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
    2621             : #else
    2622             :         // In SSE2, min_epu16 does not exist, so shift from
    2623             :         // UInt16 to SInt16 to be able to use min_epi16
    2624        5062 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2625        5062 :         const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
    2626             : #endif
                               // Bound "nWordCount - 15" guarantees two full 8-lane loads fit.
    2627       71888 :         for (; n < nWordCount - 15; n += 16)
    2628             :         {
    2629      133652 :             __m128i xmm0 = _mm_loadu_si128(
    2630       66826 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2631       66826 :             __m128i xmm1 = _mm_loadu_si128(
    2632       66826 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2633             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
                                   // Only caps at 32767 here; the saturating pack below does the
                                   // final clamp to 255. The cap keeps lanes >= 32768 from being
                                   // interpreted as negative by the pack, whose input is signed.
    2634             :             xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
    2635             :             xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
    2636             : #else
                                   // Shift into signed order, take the signed min with the
                                   // shifted image of 255, shift back: lanes are now <= 255.
    2637       66826 :             xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
    2638       66826 :             xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
    2639       66826 :             xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
    2640       66826 :             xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
    2641       66826 :             xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
    2642       66826 :             xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
    2643             : #endif
                                   // Narrow the 2 x 8 16-bit lanes into 16 bytes (unsigned
                                   // saturation) and write them with a single store.
    2644       66826 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    2645       66826 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2646             :         }
                               // Scalar tail: same clamp for the remaining (< 16) values.
    2647       16403 :         for (; n < nWordCount; n++)
    2648             :         {
    2649       11341 :             pDstData[n] =
    2650       11341 :                 pSrcData[n] >= 255 ? 255 : static_cast<uint8_t>(pSrcData[n]);
    2651        5062 :         }
    2652             :     }
    2653             :     else
    2654             :     {
                               // Non-contiguous source and/or destination.
    2655         975 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2656             :                               nDstPixelStride, nWordCount);
    2657             :     }
    2658        6037 : }
    2659             : 
    2660             : template <>
                       // GDALCopyWordsT specialization: uint16_t source, int16_t destination.
                       // Source values of 32767 or more are written as 32767, smaller values
                       // are copied unchanged. When both strides equal the element size, 16
                       // values are processed per iteration with 128-bit intrinsics and a
                       // scalar loop finishes the remainder; any other stride combination is
                       // forwarded to GDALCopyWordsGenericT().
    2661          46 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2662             :                                  int nSrcPixelStride,
    2663             :                                  int16_t *const CPL_RESTRICT pDstData,
    2664             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2665             : {
    2666          46 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2667             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2668             :     {
    2669          40 :         decltype(nWordCount) n = 0;
    2670             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2671             :         const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
    2672             : #else
    2673             :         // In SSE2, min_epu16 does not exist, so shift from
    2674             :         // UInt16 to SInt16 to be able to use min_epi16
    2675          40 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2676          40 :         const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
    2677             : #endif
                               // Bound "nWordCount - 15" guarantees two full 8-lane
                               // loads/stores fit.
    2678         169 :         for (; n < nWordCount - 15; n += 16)
    2679             :         {
    2680         258 :             __m128i xmm0 = _mm_loadu_si128(
    2681         129 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2682         129 :             __m128i xmm1 = _mm_loadu_si128(
    2683         129 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2684             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
                                   // Unsigned per-lane minimum with 32767.
    2685             :             xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
    2686             :             xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
    2687             : #else
                                   // Shift into signed order, take the signed min with the
                                   // shifted image of 32767, then shift back.
    2688         129 :             xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
    2689         129 :             xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
    2690         129 :             xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
    2691         129 :             xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
    2692         129 :             xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
    2693         129 :             xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
    2694             : #endif
    2695         129 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2696         129 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
    2697             :                              xmm1);
    2698             :         }
                               // Scalar tail: same clamp for the remaining (< 16) values.
    2699         191 :         for (; n < nWordCount; n++)
    2700             :         {
    2701         282 :             pDstData[n] = pSrcData[n] >= 32767
    2702             :                               ? 32767
    2703         131 :                               : static_cast<int16_t>(pSrcData[n]);
    2704          40 :         }
    2705             :     }
    2706             :     else
    2707             :     {
                               // Non-contiguous source and/or destination.
    2708           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2709             :                               nDstPixelStride, nWordCount);
    2710             :     }
    2711          46 : }
    2712             : 
    2713             : template <>
                       // GDALCopyWordsT specialization: int16_t source, uint16_t destination.
                       // Negative source values are written as 0, others are copied unchanged.
                       // When both strides equal the element size, 16 values are processed per
                       // iteration with 128-bit intrinsics and a scalar loop finishes the
                       // remainder; any other stride combination is forwarded to
                       // GDALCopyWordsGenericT().
    2714         136 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2715             :                                  int nSrcPixelStride,
    2716             :                                  uint16_t *const CPL_RESTRICT pDstData,
    2717             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2718             : {
    2719         136 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2720             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2721             :     {
    2722          93 :         decltype(nWordCount) n = 0;
    2723          93 :         const __m128i xmm_zero = _mm_setzero_si128();
                               // Bound "nWordCount - 15" guarantees two full 8-lane
                               // loads/stores fit.
    2724         278 :         for (; n < nWordCount - 15; n += 16)
    2725             :         {
    2726         370 :             __m128i xmm0 = _mm_loadu_si128(
    2727         185 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2728         185 :             __m128i xmm1 = _mm_loadu_si128(
    2729         185 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
                                   // Signed per-lane max with 0 turns every negative lane into 0.
    2730         185 :             xmm0 = _mm_max_epi16(xmm0, xmm_zero);
    2731         185 :             xmm1 = _mm_max_epi16(xmm1, xmm_zero);
    2732         185 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2733         185 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
    2734             :                              xmm1);
    2735             :         }
                               // Scalar tail: same clamp for the remaining (< 16) values.
    2736         471 :         for (; n < nWordCount; n++)
    2737             :         {
    2738         378 :             pDstData[n] =
    2739         378 :                 pSrcData[n] < 0 ? 0 : static_cast<uint16_t>(pSrcData[n]);
    2740          93 :         }
    2741             :     }
    2742             :     else
    2743             :     {
                               // Non-contiguous source and/or destination.
    2744          43 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2745             :                               nDstPixelStride, nWordCount);
    2746             :     }
    2747         136 : }
    2748             : 
    2749             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2750             : 
    2751             : template <>
                       // GDALCopyWordsT specialization: uint32_t source, int32_t destination.
                       // Source values of INT_MAX or more are written as INT_MAX, smaller
                       // values are copied unchanged. When both strides equal the element
                       // size, 8 values are processed per iteration with 128-bit intrinsics
                       // and a scalar loop finishes the remainder; any other stride
                       // combination is forwarded to GDALCopyWordsGenericT().
    2752             : CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
    2753             :                                  int nSrcPixelStride,
    2754             :                                  int32_t *const CPL_RESTRICT pDstData,
    2755             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2756             : {
    2757             :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2758             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2759             :     {
    2760             :         decltype(nWordCount) n = 0;
    2761             :         const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
                               // Each iteration consumes exactly 8 values (two 4-lane
                               // vectors), so advance by 8. The bound "nWordCount - 7" keeps
                               // index n + 7 inside the buffers and lets a run of exactly 8
                               // values take the vector path.
    2762             :         for (; n < nWordCount - 7; n += 8)
    2763             :         {
    2764             :             __m128i xmm0 = _mm_loadu_si128(
    2765             :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2766             :             __m128i xmm1 = _mm_loadu_si128(
    2767             :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
                                   // Unsigned per-lane minimum with INT_MAX: every result is
                                   // representable as a non-negative int32_t.
    2768             :             xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
    2769             :             xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
    2770             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2771             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
    2772             :                              xmm1);
    2773             :         }
                               // Scalar tail: same clamp for the remaining (< 8) values.
    2774             :         for (; n < nWordCount; n++)
    2775             :         {
    2776             :             pDstData[n] = pSrcData[n] >= INT_MAX
    2777             :                               ? INT_MAX
    2778             :                               : static_cast<int32_t>(pSrcData[n]);
    2779             :         }
    2780             :     }
    2781             :     else
    2782             :     {
                               // Non-contiguous source and/or destination.
    2783             :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2784             :                               nDstPixelStride, nWordCount);
    2785             :     }
    2786             : }
    2787             : 
    2788             : template <>
                       // GDALCopyWordsT specialization: int32_t source, uint32_t destination.
                       // Negative source values are written as 0, others are copied unchanged.
                       // When both strides equal the element size, 8 values are processed per
                       // iteration with 128-bit intrinsics and a scalar loop finishes the
                       // remainder; any other stride combination is forwarded to
                       // GDALCopyWordsGenericT().
    2789             : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    2790             :                                  int nSrcPixelStride,
    2791             :                                  uint32_t *const CPL_RESTRICT pDstData,
    2792             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2793             : {
    2794             :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2795             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2796             :     {
    2797             :         decltype(nWordCount) n = 0;
    2798             :         const __m128i xmm_zero = _mm_setzero_si128();
                               // Bound "nWordCount - 7" guarantees two full 4-lane
                               // loads/stores fit.
    2799             :         for (; n < nWordCount - 7; n += 8)
    2800             :         {
    2801             :             __m128i xmm0 = _mm_loadu_si128(
    2802             :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2803             :             __m128i xmm1 = _mm_loadu_si128(
    2804             :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
                                   // Signed per-lane max with 0 turns every negative lane into 0.
    2805             :             xmm0 = _mm_max_epi32(xmm0, xmm_zero);
    2806             :             xmm1 = _mm_max_epi32(xmm1, xmm_zero);
    2807             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2808             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
    2809             :                              xmm1);
    2810             :         }
                               // Scalar tail: same clamp for the remaining (< 8) values.
    2811             :         for (; n < nWordCount; n++)
    2812             :         {
    2813             :             pDstData[n] =
    2814             :                 pSrcData[n] < 0 ? 0 : static_cast<uint32_t>(pSrcData[n]);
    2815             :         }
    2816             :     }
    2817             :     else
    2818             :     {
                               // Non-contiguous source and/or destination.
    2819             :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2820             :                               nDstPixelStride, nWordCount);
    2821             :     }
    2822             : }
    2823             : 
    2824             : #endif  // defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    2825             : 
    2826             : template <>
                       // GDALCopyWordsT specialization: uint16_t source, float destination.
                       // When both strides equal the element size, 8 values are widened and
                       // converted per iteration with 128-bit intrinsics and a scalar loop
                       // finishes the remainder; any other stride combination is forwarded to
                       // GDALCopyWordsGenericT().
    2827         343 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2828             :                                  int nSrcPixelStride,
    2829             :                                  float *const CPL_RESTRICT pDstData,
    2830             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2831             : {
    2832         343 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2833             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2834             :     {
    2835         337 :         decltype(nWordCount) n = 0;
    2836         337 :         const __m128i xmm_zero = _mm_setzero_si128();
                               // Byte view of the destination: store offsets below are
                               // computed in bytes (4 per float).
    2837         337 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2838             :             reinterpret_cast<GByte *>(pDstData);
                               // Bound "nWordCount - 7" guarantees one full 8-lane load fits.
    2839        1508 :         for (; n < nWordCount - 7; n += 8)
    2840             :         {
    2841        1171 :             __m128i xmm = _mm_loadu_si128(
    2842        1171 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
                                   // Interleaving with zero zero-extends each 16-bit lane to
                                   // 32 bits (low four lanes, then high four lanes).
    2843        1171 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2844        1171 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2845        1171 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2846        1171 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2847        1171 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2848             :                           xmm0_f);
    2849             :             _mm_storeu_ps(
    2850        1171 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2851             :         }
                               // Scalar tail for the remaining (< 8) values.
    2852        1115 :         for (; n < nWordCount; n++)
    2853             :         {
    2854         778 :             pDstData[n] = pSrcData[n];
    2855         337 :         }
    2856             :     }
    2857             :     else
    2858             :     {
                               // Non-contiguous source and/or destination.
    2859           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2860             :                               nDstPixelStride, nWordCount);
    2861             :     }
    2862         343 : }
    2863             : 
    2864             : template <>
                       // GDALCopyWordsT specialization: int16_t source, float destination.
                       // When both strides equal the element size, 8 values are sign-extended
                       // and converted per iteration with 128-bit intrinsics and a scalar loop
                       // finishes the remainder; any other stride combination is forwarded to
                       // GDALCopyWordsGenericT().
    2865     1076640 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2866             :                                  int nSrcPixelStride,
    2867             :                                  float *const CPL_RESTRICT pDstData,
    2868             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2869             : {
    2870     1076640 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2871             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2872             :     {
    2873       86739 :         decltype(nWordCount) n = 0;
                               // Byte view of the destination: store offsets below are
                               // computed in bytes (4 per float).
    2874       86739 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2875             :             reinterpret_cast<GByte *>(pDstData);
                               // Bound "nWordCount - 7" guarantees one full 8-lane load fits.
    2876      586116 :         for (; n < nWordCount - 7; n += 8)
    2877             :         {
    2878      499377 :             __m128i xmm = _mm_loadu_si128(
    2879      499377 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
                                   // Arithmetic shift by 15 yields 0xFFFF for negative lanes
                                   // and 0 otherwise; interleaving it as the upper half
                                   // sign-extends each 16-bit lane to 32 bits.
    2880      499377 :             const auto sign = _mm_srai_epi16(xmm, 15);
    2881      499377 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
    2882      499377 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
    2883      499377 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2884      499377 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2885      499377 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2886             :                           xmm0_f);
    2887             :             _mm_storeu_ps(
    2888      499377 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2889             :         }
                               // Scalar tail for the remaining (< 8) values.
    2890      253879 :         for (; n < nWordCount; n++)
    2891             :         {
    2892      167140 :             pDstData[n] = pSrcData[n];
    2893       86739 :         }
    2894             :     }
    2895             :     else
    2896             :     {
                               // Non-contiguous source and/or destination.
    2897      989901 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2898             :                               nDstPixelStride, nWordCount);
    2899             :     }
    2900     1076640 : }
    2901             : 
    2902             : template <>
                       // GDALCopyWordsT specialization: uint16_t source, double destination.
                       // When both strides equal the element size, 8 values are widened and
                       // converted per iteration with 128-bit intrinsics and a scalar loop
                       // finishes the remainder; any other stride combination is forwarded to
                       // GDALCopyWordsGenericT().
    2903         449 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2904             :                                  int nSrcPixelStride,
    2905             :                                  double *const CPL_RESTRICT pDstData,
    2906             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2907             : {
    2908         449 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2909             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2910             :     {
    2911         313 :         decltype(nWordCount) n = 0;
    2912         313 :         const __m128i xmm_zero = _mm_setzero_si128();
                               // Byte view of the destination: store offsets below are
                               // computed in bytes (8 per double).
    2913         313 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2914             :             reinterpret_cast<GByte *>(pDstData);
                               // Bound "nWordCount - 7" guarantees one full 8-lane load fits.
    2915         829 :         for (; n < nWordCount - 7; n += 8)
    2916             :         {
    2917         516 :             __m128i xmm = _mm_loadu_si128(
    2918         516 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
                                   // Interleaving with zero zero-extends each 16-bit lane to
                                   // 32 bits (low four lanes, then high four lanes).
    2919         516 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2920         516 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2921             : 
                                   // _mm_cvtepi32_pd converts only the two low 32-bit lanes,
                                   // so each register is converted, shifted right by 8 bytes to
                                   // bring its two upper lanes down, and converted again.
    2922         516 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2923         516 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2924         516 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2925         516 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2926         516 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2927         516 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2928             : 
                                   // Four stores of two doubles (16 bytes) each, in source
                                   // order.
    2929         516 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2930             :                           xmm0_low_d);
    2931             :             _mm_storeu_pd(
    2932         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2933             :                 xmm0_high_d);
    2934             :             _mm_storeu_pd(
    2935         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2936             :                 xmm1_low_d);
    2937             :             _mm_storeu_pd(
    2938         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2939             :                 xmm1_high_d);
    2940             :         }
                               // Scalar tail for the remaining (< 8) values.
    2941        1082 :         for (; n < nWordCount; n++)
    2942             :         {
    2943         769 :             pDstData[n] = pSrcData[n];
    2944         313 :         }
    2945             :     }
    2946             :     else
    2947             :     {
                               // Non-contiguous source and/or destination.
    2948         136 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2949             :                               nDstPixelStride, nWordCount);
    2950             :     }
    2951         449 : }
    2952             : 
    2953             : template <>
                       // GDALCopyWordsT specialization: int16_t source, double destination.
                       // When both strides equal the element size, 8 values are sign-extended
                       // and converted per iteration with 128-bit intrinsics and a scalar loop
                       // finishes the remainder; any other stride combination is forwarded to
                       // GDALCopyWordsGenericT().
    2954     4923020 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2955             :                                  int nSrcPixelStride,
    2956             :                                  double *const CPL_RESTRICT pDstData,
    2957             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2958             : {
    2959     4923020 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2960             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2961             :     {
    2962       34621 :         decltype(nWordCount) n = 0;
                               // Byte view of the destination: store offsets below are
                               // computed in bytes (8 per double).
    2963       34621 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2964             :             reinterpret_cast<GByte *>(pDstData);
                               // Bound "nWordCount - 7" guarantees one full 8-lane load fits.
    2965      403422 :         for (; n < nWordCount - 7; n += 8)
    2966             :         {
    2967      368801 :             __m128i xmm = _mm_loadu_si128(
    2968      368801 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
                                   // Arithmetic shift by 15 yields 0xFFFF for negative lanes
                                   // and 0 otherwise; interleaving it as the upper half
                                   // sign-extends each 16-bit lane to 32 bits.
    2969      368801 :             const auto sign = _mm_srai_epi16(xmm, 15);
    2970      368801 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
    2971      368801 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
    2972             : 
                                   // _mm_cvtepi32_pd converts only the two low 32-bit lanes,
                                   // so each register is converted, shifted right by 8 bytes to
                                   // bring its two upper lanes down, and converted again.
    2973      368801 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2974      368801 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2975      368801 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2976      368801 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2977      368801 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2978      368801 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2979             : 
                                   // Four stores of two doubles (16 bytes) each, in source
                                   // order.
    2980      368801 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2981             :                           xmm0_low_d);
    2982             :             _mm_storeu_pd(
    2983      368801 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2984             :                 xmm0_high_d);
    2985             :             _mm_storeu_pd(
    2986      368801 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2987             :                 xmm1_low_d);
    2988             :             _mm_storeu_pd(
    2989      368801 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2990             :                 xmm1_high_d);
    2991             :         }
                               // Scalar tail for the remaining (< 8) values.
    2992      254691 :         for (; n < nWordCount; n++)
    2993             :         {
    2994      220070 :             pDstData[n] = pSrcData[n];
    2995       34621 :         }
    2996             :     }
    2997             :     else
    2998             :     {
                               // Non-contiguous source and/or destination.
    2999     4888400 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    3000             :                               nDstPixelStride, nWordCount);
    3001             :     }
    3002     4923020 : }
    3003             : 
    3004             : // ---- AVX2 helpers for int32 narrowing (runtime dispatch) ----
    3005             : 
    3006             : #if defined(HAVE_AVX2_DISPATCH) || defined(HAVE_AVX2_NATIVELY)
    3007             : #if defined(HAVE_AVX2_DISPATCH) && !defined(HAVE_AVX2_DISPATCH_MSVC)
    3008             : __attribute__((target("avx2")))
    3009             : #endif
    3010       12727 : static void GDALCopyWordsInt32ToUInt8_AVX2(const int32_t *CPL_RESTRICT pSrc,
    3011             :                                            uint8_t *CPL_RESTRICT pDst,
    3012             :                                            GPtrDiff_t nWordCount)
    3013             : {
    3014       12727 :     const __m256i ymm_zero = _mm256_setzero_si256();
    3015       12727 :     const __m256i ymm_255 = _mm256_set1_epi32(255);
    3016       12727 :     const __m256i permuteIdx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
    3017       12727 :     GPtrDiff_t n = 0;
    3018      958123 :     for (; n < nWordCount - 31; n += 32)
    3019             :     {
    3020             :         __m256i v0 =
    3021      945396 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
    3022             :         __m256i v1 =
    3023      945396 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
    3024      945396 :         __m256i v2 = _mm256_loadu_si256(
    3025      945396 :             reinterpret_cast<const __m256i *>(pSrc + n + 16));
    3026      945396 :         __m256i v3 = _mm256_loadu_si256(
    3027      945396 :             reinterpret_cast<const __m256i *>(pSrc + n + 24));
    3028             :         // Clamp to [0, 255]
    3029      945396 :         v0 = _mm256_max_epi32(v0, ymm_zero);
    3030      945396 :         v1 = _mm256_max_epi32(v1, ymm_zero);
    3031      945396 :         v2 = _mm256_max_epi32(v2, ymm_zero);
    3032      945396 :         v3 = _mm256_max_epi32(v3, ymm_zero);
    3033      945396 :         v0 = _mm256_min_epi32(v0, ymm_255);
    3034      945396 :         v1 = _mm256_min_epi32(v1, ymm_255);
    3035      945396 :         v2 = _mm256_min_epi32(v2, ymm_255);
    3036      945396 :         v3 = _mm256_min_epi32(v3, ymm_255);
    3037             :         // Pack int32→int16→uint8, then fix cross-lane ordering
    3038      945396 :         __m256i ab16 = _mm256_packs_epi32(v0, v1);
    3039      945396 :         __m256i cd16 = _mm256_packs_epi32(v2, v3);
    3040      945396 :         __m256i bytes = _mm256_packus_epi16(ab16, cd16);
    3041      945396 :         bytes = _mm256_permutevar8x32_epi32(bytes, permuteIdx);
    3042      945396 :         _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), bytes);
    3043             :     }
    3044       68601 :     for (; n < nWordCount; n++)
    3045             :     {
    3046       70969 :         pDst[n] = pSrc[n] <= 0     ? 0
    3047       15095 :                   : pSrc[n] >= 255 ? 255
    3048        1079 :                                    : static_cast<uint8_t>(pSrc[n]);
    3049             :     }
    3050       12727 : }
    3051             : #endif  // HAVE_AVX2_DISPATCH || HAVE_AVX2_NATIVELY
    3052             : 
    3053             : #if defined(HAVE_AVX2_DISPATCH) || defined(HAVE_AVX2_NATIVELY)
    3054             : #if defined(HAVE_AVX2_DISPATCH) && !defined(HAVE_AVX2_DISPATCH_MSVC)
    3055             : __attribute__((target("avx2")))
    3056             : #endif
    3057       10277 : static void GDALCopyWordsInt32ToUInt16_AVX2(const int32_t *CPL_RESTRICT pSrc,
    3058             :                                             uint16_t *CPL_RESTRICT pDst,
    3059             :                                             GPtrDiff_t nWordCount)
    3060             : {
    3061       10277 :     const __m256i ymm_zero = _mm256_setzero_si256();
    3062             :     // _mm256_packus_epi32(v0, v1) produces per-lane interleaved result:
    3063             :     //   [v0_lo4, v1_lo4, v0_hi4, v1_hi4] (in uint16 pairs per 32-bit lane)
    3064             :     // Permute to deinterleave: all v0 values first, then all v1 values
    3065       10277 :     const __m256i permuteIdx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
    3066       10277 :     GPtrDiff_t n = 0;
    3067      670572 :     for (; n < nWordCount - 15; n += 16)
    3068             :     {
    3069             :         __m256i v0 =
    3070      660295 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
    3071             :         __m256i v1 =
    3072     1320590 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
    3073             :         // Clamp to [0, 65535]: _mm256_packus_epi32 saturates uint
    3074      660295 :         v0 = _mm256_max_epi32(v0, ymm_zero);
    3075      660295 :         v1 = _mm256_max_epi32(v1, ymm_zero);
    3076      660295 :         __m256i packed = _mm256_packus_epi32(v0, v1);
    3077             :         // Fix cross-lane interleave from packus
    3078      660295 :         packed = _mm256_permutevar8x32_epi32(packed, permuteIdx);
    3079      660295 :         _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), packed);
    3080             :     }
    3081      163928 :     for (; n < nWordCount; n++)
    3082             :     {
    3083      307282 :         pDst[n] = pSrc[n] <= 0       ? 0
    3084      153631 :                   : pSrc[n] >= 65535 ? 65535
    3085      153599 :                                      : static_cast<uint16_t>(pSrc[n]);
    3086             :     }
    3087       10277 : }
    3088             : #endif  // HAVE_AVX2_DISPATCH || HAVE_AVX2_NATIVELY
    3089             : 
    3090             : // ---- int32 -> uint8 with clamping to [0, 255] ----
    3091             : template <>
    3092       13641 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    3093             :                                  int nSrcPixelStride,
    3094             :                                  uint8_t *const CPL_RESTRICT pDstData,
    3095             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3096             : {
    3097       13641 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    3098             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    3099             :     {
    3100             : #if defined(HAVE_AVX2_DISPATCH)
    3101       12727 :         if (CPLHaveRuntimeAVX2())
    3102             :         {
    3103       12727 :             GDALCopyWordsInt32ToUInt8_AVX2(pSrcData, pDstData, nWordCount);
    3104       12727 :             return;
    3105             :         }
    3106             : #elif defined(HAVE_AVX2_NATIVELY)
    3107             :         GDALCopyWordsInt32ToUInt8_AVX2(pSrcData, pDstData, nWordCount);
    3108             :         return;
    3109             : #endif
    3110             : #ifdef HAVE_SSE2
    3111             :         // SSE2 path: 16 pixels per iteration
    3112           0 :         decltype(nWordCount) n = 0;
    3113           0 :         const __m128i xmm_255 = _mm_set1_epi32(255);
    3114           0 :         for (; n < nWordCount - 15; n += 16)
    3115             :         {
    3116           0 :             __m128i v0 = _mm_loadu_si128(
    3117           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    3118           0 :             __m128i v1 = _mm_loadu_si128(
    3119           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    3120           0 :             __m128i v2 = _mm_loadu_si128(
    3121           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    3122           0 :             __m128i v3 = _mm_loadu_si128(
    3123           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 12));
    3124             :             // Clamp to [0, 255] using SSE2 arithmetic:
    3125             :             // max(v, 0): zero out negatives via sign bit mask
    3126           0 :             v0 = _mm_andnot_si128(_mm_srai_epi32(v0, 31), v0);
    3127           0 :             v1 = _mm_andnot_si128(_mm_srai_epi32(v1, 31), v1);
    3128           0 :             v2 = _mm_andnot_si128(_mm_srai_epi32(v2, 31), v2);
    3129           0 :             v3 = _mm_andnot_si128(_mm_srai_epi32(v3, 31), v3);
    3130             :             // min(v, 255): blend 255 where v > 255
    3131           0 :             __m128i gt0 = _mm_cmpgt_epi32(v0, xmm_255);
    3132           0 :             __m128i gt1 = _mm_cmpgt_epi32(v1, xmm_255);
    3133           0 :             __m128i gt2 = _mm_cmpgt_epi32(v2, xmm_255);
    3134           0 :             __m128i gt3 = _mm_cmpgt_epi32(v3, xmm_255);
    3135           0 :             v0 = _mm_or_si128(_mm_andnot_si128(gt0, v0),
    3136             :                               _mm_and_si128(gt0, xmm_255));
    3137           0 :             v1 = _mm_or_si128(_mm_andnot_si128(gt1, v1),
    3138             :                               _mm_and_si128(gt1, xmm_255));
    3139           0 :             v2 = _mm_or_si128(_mm_andnot_si128(gt2, v2),
    3140             :                               _mm_and_si128(gt2, xmm_255));
    3141           0 :             v3 = _mm_or_si128(_mm_andnot_si128(gt3, v3),
    3142             :                               _mm_and_si128(gt3, xmm_255));
    3143             :             // Values in [0, 255]: pack int32→int16→uint8
    3144           0 :             __m128i lo16 = _mm_packs_epi32(v0, v1);
    3145           0 :             __m128i hi16 = _mm_packs_epi32(v2, v3);
    3146           0 :             __m128i bytes = _mm_packus_epi16(lo16, hi16);
    3147           0 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), bytes);
    3148             :         }
    3149           0 :         for (; n < nWordCount; n++)
    3150             : #else
    3151             :         for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3152             : #endif
    3153             :         {
    3154           0 :             pDstData[n] = pSrcData[n] <= 0 ? 0
    3155           0 :                           : pSrcData[n] >= 255
    3156             :                               ? 255
    3157           0 :                               : static_cast<uint8_t>(pSrcData[n]);
    3158           0 :         }
    3159             :     }
    3160             :     else
    3161             :     {
    3162         914 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    3163             :                               nDstPixelStride, nWordCount);
    3164             :     }
    3165             : }
    3166             : 
    3167             : // ---- int32 -> uint16 with clamping to [0, 65535] ----
    3168             : template <>
    3169       10322 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    3170             :                                  int nSrcPixelStride,
    3171             :                                  uint16_t *const CPL_RESTRICT pDstData,
    3172             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3173             : {
    3174       10322 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    3175             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    3176             :     {
    3177             : #if defined(HAVE_AVX2_DISPATCH)
    3178       10277 :         if (CPLHaveRuntimeAVX2())
    3179             :         {
    3180       10277 :             GDALCopyWordsInt32ToUInt16_AVX2(pSrcData, pDstData, nWordCount);
    3181       10277 :             return;
    3182             :         }
    3183             : #elif defined(HAVE_AVX2_NATIVELY)
    3184             :         GDALCopyWordsInt32ToUInt16_AVX2(pSrcData, pDstData, nWordCount);
    3185             :         return;
    3186             : #endif
    3187           0 :         decltype(nWordCount) n = 0;
    3188             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    3189             :         // SSE4.1: _mm_packus_epi32 directly handles uint saturation
    3190             :         for (; n < nWordCount - 7; n += 8)
    3191             :         {
    3192             :             __m128i v0 = _mm_loadu_si128(
    3193             :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    3194             :             __m128i v1 = _mm_loadu_si128(
    3195             :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    3196             :             v0 = _mm_max_epi32(v0, _mm_setzero_si128());
    3197             :             v1 = _mm_max_epi32(v1, _mm_setzero_si128());
    3198             :             __m128i packed = _mm_packus_epi32(v0, v1);
    3199             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), packed);
    3200             :         }
    3201             : #else
    3202             :         // SSE2: clamp to [0, 65535], bias to signed range, pack, unbias
    3203           0 :         const __m128i xmm_65535 = _mm_set1_epi32(65535);
    3204           0 :         const __m128i xmm_bias32 = _mm_set1_epi32(32768);
    3205           0 :         const __m128i xmm_bias16 = _mm_set1_epi16(-32768);
    3206           0 :         for (; n < nWordCount - 7; n += 8)
    3207             :         {
    3208           0 :             __m128i v0 = _mm_loadu_si128(
    3209           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    3210           0 :             __m128i v1 = _mm_loadu_si128(
    3211           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    3212             :             // max(v, 0)
    3213           0 :             v0 = _mm_andnot_si128(_mm_srai_epi32(v0, 31), v0);
    3214           0 :             v1 = _mm_andnot_si128(_mm_srai_epi32(v1, 31), v1);
    3215             :             // min(v, 65535)
    3216           0 :             __m128i gt0 = _mm_cmpgt_epi32(v0, xmm_65535);
    3217           0 :             __m128i gt1 = _mm_cmpgt_epi32(v1, xmm_65535);
    3218           0 :             v0 = _mm_or_si128(_mm_andnot_si128(gt0, v0),
    3219             :                               _mm_and_si128(gt0, xmm_65535));
    3220           0 :             v1 = _mm_or_si128(_mm_andnot_si128(gt1, v1),
    3221             :                               _mm_and_si128(gt1, xmm_65535));
    3222             :             // Shift [0, 65535] → [-32768, 32767] for _mm_packs_epi32
    3223           0 :             v0 = _mm_sub_epi32(v0, xmm_bias32);
    3224           0 :             v1 = _mm_sub_epi32(v1, xmm_bias32);
    3225           0 :             __m128i packed = _mm_packs_epi32(v0, v1);
    3226             :             // Shift back: sub_epi16(x, -32768) == add 32768 (mod 2^16)
    3227           0 :             packed = _mm_sub_epi16(packed, xmm_bias16);
    3228           0 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), packed);
    3229             :         }
    3230             : #endif
    3231           0 :         for (; n < nWordCount; n++)
    3232             :         {
    3233           0 :             pDstData[n] = pSrcData[n] <= 0 ? 0
    3234           0 :                           : pSrcData[n] >= 65535
    3235             :                               ? 65535
    3236           0 :                               : static_cast<uint16_t>(pSrcData[n]);
    3237           0 :         }
    3238             :     }
    3239             :     else
    3240             :     {
    3241          45 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    3242             :                               nDstPixelStride, nWordCount);
    3243             :     }
    3244             : }
    3245             : 
    3246             : #endif  // HAVE_SSE2
    3247             : 
    3248             : template <>
    3249     4436800 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3250             :                                  int nSrcPixelStride,
    3251             :                                  GByte *const CPL_RESTRICT pDstData,
    3252             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3253             : {
    3254     4436800 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3255             :                             nDstPixelStride, nWordCount);
    3256     4436800 : }
    3257             : 
    3258             : template <>
    3259       38387 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3260             :                                  int nSrcPixelStride,
    3261             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    3262             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3263             : {
    3264       38387 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3265             :                             nDstPixelStride, nWordCount);
    3266       38387 : }
    3267             : 
    3268             : template <>
    3269       55620 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3270             :                                  int nSrcPixelStride,
    3271             :                                  double *const CPL_RESTRICT pDstData,
    3272             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3273             : {
    3274       55620 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3275             :                             nDstPixelStride, nWordCount);
    3276       55620 : }
    3277             : 
    3278             : template <>
    3279      122818 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3280             :                                  int nSrcPixelStride,
    3281             :                                  float *const CPL_RESTRICT pDstData,
    3282             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3283             : {
    3284      122818 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3285             :                             nDstPixelStride, nWordCount);
    3286      122818 : }
    3287             : 
    3288             : template <>
    3289         412 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
    3290             :                                  int nSrcPixelStride,
    3291             :                                  float *const CPL_RESTRICT pDstData,
    3292             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3293             : {
    3294         412 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3295             :                             nDstPixelStride, nWordCount);
    3296         412 : }
    3297             : 
    3298             : template <>
    3299         544 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
    3300             :                                  int nSrcPixelStride,
    3301             :                                  double *const CPL_RESTRICT pDstData,
    3302             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3303             : {
    3304         544 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3305             :                             nDstPixelStride, nWordCount);
    3306         544 : }
    3307             : 
    3308             : template <>
    3309      318163 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3310             :                                  int nSrcPixelStride,
    3311             :                                  GByte *const CPL_RESTRICT pDstData,
    3312             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3313             : {
    3314      318163 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3315             :                             nDstPixelStride, nWordCount);
    3316      318163 : }
    3317             : 
    3318             : template <>
    3319          55 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3320             :                                  int nSrcPixelStride,
    3321             :                                  GInt8 *const CPL_RESTRICT pDstData,
    3322             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3323             : {
    3324          55 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3325             :                             nDstPixelStride, nWordCount);
    3326          55 : }
    3327             : 
    3328             : template <>
    3329       15775 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3330             :                                  int nSrcPixelStride,
    3331             :                                  GInt16 *const CPL_RESTRICT pDstData,
    3332             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3333             : {
    3334       15775 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3335             :                             nDstPixelStride, nWordCount);
    3336       15775 : }
    3337             : 
    3338             : template <>
    3339       61713 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3340             :                                  int nSrcPixelStride,
    3341             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    3342             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3343             : {
    3344       61713 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3345             :                             nDstPixelStride, nWordCount);
    3346       61713 : }
    3347             : 
    3348             : template <>
    3349       43884 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3350             :                                  int nSrcPixelStride,
    3351             :                                  GInt32 *const CPL_RESTRICT pDstData,
    3352             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3353             : {
    3354       43884 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3355             :                             nDstPixelStride, nWordCount);
    3356       43884 : }
    3357             : 
    3358             : template <>
    3359          72 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3360             :                                  int nSrcPixelStride,
    3361             :                                  GFloat16 *const CPL_RESTRICT pDstData,
    3362             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3363             : {
    3364          72 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3365             :                             nDstPixelStride, nWordCount);
    3366          72 : }
    3367             : 
    3368             : template <>
    3369          63 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3370             :                                  int nSrcPixelStride,
    3371             :                                  GFloat16 *const CPL_RESTRICT pDstData,
    3372             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3373             : {
    3374          63 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3375             :                             nDstPixelStride, nWordCount);
    3376          63 : }
    3377             : 
    3378             : /************************************************************************/
    3379             : /*                       GDALCopyWordsComplexT()                        */
    3380             : /************************************************************************/
    3381             : /**
    3382             :  * Template function, used to copy data from pSrcData into buffer
    3383             :  * pDstData, with stride nSrcPixelStride in the source data and
    3384             :  * stride nDstPixelStride in the destination data. Deals with the
    3385             :  * complex case, where input is complex and output is complex.
    3386             :  *
    3387             :  * @param pSrcData the source data buffer
    3388             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    3389             :  *                      of interest.
    3390             :  * @param pDstData the destination buffer.
    3391             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    3392             :  *                      interest.
    3393             :  * @param nWordCount the total number of pixel words to copy
    3394             :  *
    3395             :  */
    3396             : template <class Tin, class Tout>
    3397       98787 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
    3398             :                                   int nSrcPixelStride,
    3399             :                                   Tout *const CPL_RESTRICT pDstData,
    3400             :                                   int nDstPixelStride, GPtrDiff_t nWordCount)
    3401             : {
    3402       98787 :     decltype(nWordCount) nDstOffset = 0;
    3403       98787 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    3404       98787 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    3405             : 
    3406     5631237 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3407             :     {
    3408     5532445 :         const Tin *const pPixelIn =
    3409     5532445 :             reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    3410     5532445 :         Tout *const pPixelOut =
    3411     5532445 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    3412             : 
    3413     5532445 :         GDALCopyWord(pPixelIn[0], pPixelOut[0]);
    3414     5532445 :         GDALCopyWord(pPixelIn[1], pPixelOut[1]);
    3415             : 
    3416     5532445 :         nDstOffset += nDstPixelStride;
    3417             :     }
    3418       98787 : }
    3419             : 
    3420             : /************************************************************************/
    3421             : /*                      GDALCopyWordsComplexOutT()                      */
    3422             : /************************************************************************/
    3423             : /**
    3424             :  * Template function, used to copy data from pSrcData into buffer
    3425             :  * pDstData, with stride nSrcPixelStride in the source data and
    3426             :  * stride nDstPixelStride in the destination data. Deals with the
    3427             :  * case where the value is real coming in, but complex going out.
    3428             :  *
    3429             :  * @param pSrcData the source data buffer
    3430             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    3431             :  *                      of interest, in bytes.
    3432             :  * @param pDstData the destination buffer.
    3433             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    3434             :  *                      interest, in bytes.
    3435             :  * @param nWordCount the total number of pixel words to copy
    3436             :  *
    3437             :  */
    3438             : template <class Tin, class Tout>
    3439        4762 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
    3440             :                                      int nSrcPixelStride,
    3441             :                                      Tout *const CPL_RESTRICT pDstData,
    3442             :                                      int nDstPixelStride, GPtrDiff_t nWordCount)
    3443             : {
    3444        4762 :     decltype(nWordCount) nDstOffset = 0;
    3445             : 
    3446        4762 :     const Tout tOutZero = static_cast<Tout>(0);
    3447             : 
    3448        4762 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    3449        4762 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    3450             : 
    3451     1190408 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3452             :     {
    3453     1185646 :         const Tin tValue =
    3454     1185646 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    3455     1185646 :         Tout *const pPixelOut =
    3456     1185646 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    3457     1185646 :         GDALCopyWord(tValue, *pPixelOut);
    3458             : 
    3459     1185646 :         pPixelOut[1] = tOutZero;
    3460             : 
    3461     1185646 :         nDstOffset += nDstPixelStride;
    3462             :     }
    3463        4762 : }
    3464             : 
    3465             : /************************************************************************/
    3466             : /*                         GDALCopyWordsFromT()                         */
    3467             : /************************************************************************/
    3468             : /**
    3469             :  * Template driver function. Given the input type T, call the appropriate
    3470             :  * GDALCopyWordsT function template for the desired output type. You should
    3471             :  * never call this function directly (call GDALCopyWords instead).
    3472             :  *
                        * For the complex destination types (GDT_CInt16, GDT_CInt32, GDT_CFloat16,
                        * GDT_CFloat32 and GDT_CFloat64), GDALCopyWordsComplexT is called when
                        * bInComplex is true and GDALCopyWordsComplexOutT otherwise. GDT_Unknown and
                        * GDT_TypeCount are not valid destination types and reach CPLAssert(false).
                        *
    3473             :  * @param pSrcData source data buffer
    3474             :  * @param nSrcPixelStride pixel stride in input buffer, in bytes
    3475             :  * @param bInComplex input is complex (only consulted for complex destination types)
    3476             :  * @param pDstData destination data buffer
    3477             :  * @param eDstType destination data type
    3478             :  * @param nDstPixelStride pixel stride in output buffer, in bytes
    3479             :  * @param nWordCount number of pixel words to be copied
    3480             :  */
    3481             : template <class T>
    3482    61300899 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
    3483             :                                int nSrcPixelStride, bool bInComplex,
    3484             :                                void *CPL_RESTRICT pDstData,
    3485             :                                GDALDataType eDstType, int nDstPixelStride,
    3486             :                                GPtrDiff_t nWordCount)
    3487             : {
                           // Each case only differs by the C type pDstData is cast to.
    3488    61300899 :     switch (eDstType)
    3489             :     {
    3490     4799947 :         case GDT_UInt8:
    3491     4799947 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3492             :                            static_cast<unsigned char *>(pDstData),
    3493             :                            nDstPixelStride, nWordCount);
    3494     4799947 :             break;
    3495        1891 :         case GDT_Int8:
    3496        1891 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3497             :                            static_cast<signed char *>(pDstData),
    3498             :                            nDstPixelStride, nWordCount);
    3499        1891 :             break;
    3500     1143524 :         case GDT_UInt16:
    3501     1143524 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3502             :                            static_cast<unsigned short *>(pDstData),
    3503             :                            nDstPixelStride, nWordCount);
    3504     1143524 :             break;
    3505     4162742 :         case GDT_Int16:
    3506     4162742 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3507             :                            static_cast<short *>(pDstData), nDstPixelStride,
    3508             :                            nWordCount);
    3509     4162742 :             break;
    3510       23084 :         case GDT_UInt32:
    3511       23084 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3512             :                            static_cast<unsigned int *>(pDstData),
    3513             :                            nDstPixelStride, nWordCount);
    3514       23084 :             break;
    3515    29449431 :         case GDT_Int32:
    3516    29449431 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3517             :                            static_cast<int *>(pDstData), nDstPixelStride,
    3518             :                            nWordCount);
    3519    29449431 :             break;
    3520        1250 :         case GDT_UInt64:
    3521        1250 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3522             :                            static_cast<std::uint64_t *>(pDstData),
    3523             :                            nDstPixelStride, nWordCount);
    3524        1250 :             break;
    3525        5957 :         case GDT_Int64:
    3526        5957 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3527             :                            static_cast<std::int64_t *>(pDstData),
    3528             :                            nDstPixelStride, nWordCount);
    3529        5957 :             break;
    3530         999 :         case GDT_Float16:
    3531         999 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3532             :                            static_cast<GFloat16 *>(pDstData), nDstPixelStride,
    3533             :                            nWordCount);
    3534         999 :             break;
    3535     4220966 :         case GDT_Float32:
    3536     4220966 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3537             :                            static_cast<float *>(pDstData), nDstPixelStride,
    3538             :                            nWordCount);
    3539     4220966 :             break;
    3540    17387423 :         case GDT_Float64:
    3541    17387423 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3542             :                            static_cast<double *>(pDstData), nDstPixelStride,
    3543             :                            nWordCount);
    3544    17387423 :             break;
    3545       94424 :         case GDT_CInt16:
    3546       94424 :             if (bInComplex)
    3547             :             {
    3548       93170 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3549             :                                       static_cast<short *>(pDstData),
    3550             :                                       nDstPixelStride, nWordCount);
    3551             :             }
    3552             :             else  // input is not complex, so we need to promote to a complex
    3553             :                   // buffer
    3554             :             {
    3555        1254 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3556             :                                          static_cast<short *>(pDstData),
    3557             :                                          nDstPixelStride, nWordCount);
    3558             :             }
    3559       94424 :             break;
    3560        1349 :         case GDT_CInt32:
    3561        1349 :             if (bInComplex)
    3562             :             {
    3563         717 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3564             :                                       static_cast<int *>(pDstData),
    3565             :                                       nDstPixelStride, nWordCount);
    3566             :             }
    3567             :             else  // input is not complex, so we need to promote to a complex
    3568             :                   // buffer
    3569             :             {
    3570         632 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3571             :                                          static_cast<int *>(pDstData),
    3572             :                                          nDstPixelStride, nWordCount);
    3573             :             }
    3574        1349 :             break;
    3575         313 :         case GDT_CFloat16:
    3576         313 :             if (bInComplex)
    3577             :             {
    3578          48 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3579             :                                       static_cast<GFloat16 *>(pDstData),
    3580             :                                       nDstPixelStride, nWordCount);
    3581             :             }
    3582             :             else  // input is not complex, so we need to promote to a complex
    3583             :                   // buffer
    3584             :             {
    3585         265 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3586             :                                          static_cast<GFloat16 *>(pDstData),
    3587             :                                          nDstPixelStride, nWordCount);
    3588             :             }
    3589         313 :             break;
    3590        3923 :         case GDT_CFloat32:
    3591        3923 :             if (bInComplex)
    3592             :             {
    3593        3114 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3594             :                                       static_cast<float *>(pDstData),
    3595             :                                       nDstPixelStride, nWordCount);
    3596             :             }
    3597             :             else  // input is not complex, so we need to promote to a complex
    3598             :                   // buffer
    3599             :             {
    3600         809 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3601             :                                          static_cast<float *>(pDstData),
    3602             :                                          nDstPixelStride, nWordCount);
    3603             :             }
    3604        3923 :             break;
    3605        3540 :         case GDT_CFloat64:
    3606        3540 :             if (bInComplex)
    3607             :             {
    3608        1738 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3609             :                                       static_cast<double *>(pDstData),
    3610             :                                       nDstPixelStride, nWordCount);
    3611             :             }
    3612             :             else  // input is not complex, so we need to promote to a complex
    3613             :                   // buffer
    3614             :             {
    3615        1802 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3616             :                                          static_cast<double *>(pDstData),
    3617             :                                          nDstPixelStride, nWordCount);
    3618             :             }
    3619        3540 :             break;
                               // Not valid destination types: nothing is copied.
    3620           0 :         case GDT_Unknown:
    3621             :         case GDT_TypeCount:
    3622           0 :             CPLAssert(false);
    3623             :     }
    3624    61300899 : }
    3625             : 
    3626             : }  // end anonymous namespace
    3627             : 
    3628             : /************************************************************************/
    3629             : /*                         GDALReplicateWord()                          */
    3630             : /************************************************************************/
    3631             : 
                       // Copies the first element of pDstData (read as a T) into the nWordCount
                       // elements that follow it. The first element itself is only read, and
                       // nWordCount does not include it. nDstPixelStride is the distance in bytes
                       // between two consecutive elements.
    3632             : template <class T>
    3633      600347 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
    3634             :                                GPtrDiff_t nWordCount)
    3635             : {
    3636      600347 :     const T valSet = *static_cast<const T *>(pDstData);
    3637      600347 :     if (nDstPixelStride == static_cast<int>(sizeof(T)))
    3638             :     {
                               // Contiguous elements: typed pointer, starting at the second
                               // element, with a loop unrolled 4 times.
    3639      570534 :         T *pDstPtr = static_cast<T *>(pDstData) + 1;
    3640    31989919 :         while (nWordCount >= 4)
    3641             :         {
    3642    31419380 :             nWordCount -= 4;
    3643    31419380 :             pDstPtr[0] = valSet;
    3644    31419380 :             pDstPtr[1] = valSet;
    3645    31419380 :             pDstPtr[2] = valSet;
    3646    31419380 :             pDstPtr[3] = valSet;
    3647    31419380 :             pDstPtr += 4;
    3648             :         }
                               // Up to 3 remaining elements.
    3649     1476400 :         while (nWordCount > 0)
    3650             :         {
    3651      905866 :             --nWordCount;
    3652      905866 :             *pDstPtr = valSet;
    3653      905866 :             pDstPtr++;
    3654             :         }
    3655             :     }
    3656             :     else
    3657             :     {
                               // Strided elements: advance a byte pointer by nDstPixelStride
                               // and store one T at each position.
    3658       29813 :         GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
    3659     1040984 :         while (nWordCount > 0)
    3660             :         {
    3661     1011171 :             --nWordCount;
    3662     1011171 :             *reinterpret_cast<T *>(pabyDstPtr) = valSet;
    3663     1011171 :             pabyDstPtr += nDstPixelStride;
    3664             :         }
    3665             :     }
    3666      600347 : }
    3667             : 
    3668     1067780 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
    3669             :                               GDALDataType eSrcType,
    3670             :                               void *CPL_RESTRICT pDstData,
    3671             :                               GDALDataType eDstType, int nDstPixelStride,
    3672             :                               GPtrDiff_t nWordCount)
    3673             : {
                           // Fills nWordCount destination elements, spaced nDstPixelStride bytes
                           // apart, with the single value found at pSrcData: the value is
                           // converted once from eSrcType to eDstType into the first destination
                           // element, which is then duplicated into the following ones.
    3674             :     /* -----------------------------------------------------------------------
    3675             :      */
    3676             :     /* Special case when the source data is always the same value */
    3677             :     /* (for VRTSourcedRasterBand::IRasterIO and
    3678             :      * VRTDerivedRasterBand::IRasterIO*/
    3679             :     /*  for example) */
    3680             :     /* -----------------------------------------------------------------------
    3681             :      */
    3682             :     // Let the general translation case do the necessary conversions
    3683             :     // on the first destination element.
    3684     1067780 :     GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
    3685             : 
    3686             :     // Now copy the first element to the nWordCount - 1 following destination
    3687             :     // elements.
                           // From here on nWordCount counts the elements still to be written and
                           // pabyDstWord points at the second destination element.
    3688     1067780 :     nWordCount--;
    3689     1067780 :     GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
    3690             : 
    3691     1067780 :     switch (eDstType)
    3692             :     {
                               // One-byte types: memset when the bytes are contiguous, otherwise
                               // a strided byte loop.
    3693      467342 :         case GDT_UInt8:
    3694             :         case GDT_Int8:
    3695             :         {
    3696      467342 :             if (nDstPixelStride == 1)
    3697             :             {
    3698      369424 :                 if (nWordCount > 0)
    3699      369424 :                     memset(pabyDstWord,
    3700      369424 :                            *reinterpret_cast<const GByte *>(pDstData),
    3701             :                            nWordCount);
    3702             :             }
    3703             :             else
    3704             :             {
    3705       97918 :                 GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
    3706    67697100 :                 while (nWordCount > 0)
    3707             :                 {
    3708    67599200 :                     --nWordCount;
    3709    67599200 :                     *pabyDstWord = valSet;
    3710    67599200 :                     pabyDstWord += nDstPixelStride;
    3711             :                 }
    3712             :             }
    3713      467342 :             break;
    3714             :         }
    3715             : 
                       // Other non-complex types: delegate to GDALReplicateWordT instantiated for
                       // the matching C type.
    3716             : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type)                               \
    3717             :     case enum_type:                                                            \
    3718             :     {                                                                          \
    3719             :         GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount);     \
    3720             :         break;                                                                 \
    3721             :     }
    3722             : 
    3723       34513 :             CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
    3724      202461 :             CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
    3725          74 :             CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
    3726      301585 :             CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
    3727          41 :             CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
    3728        1072 :             CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
    3729           2 :             CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
    3730       52827 :             CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
    3731        7772 :             CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
    3732             : 
                       // Complex types: the two components of the first destination pixel are
                       // copied to every following pixel.
    3733             : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type)                              \
    3734             :     case enum_type:                                                            \
    3735             :     {                                                                          \
    3736             :         c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0];        \
    3737             :         c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1];        \
    3738             :         while (nWordCount > 0)                                                 \
    3739             :         {                                                                      \
    3740             :             --nWordCount;                                                      \
    3741             :             reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1;              \
    3742             :             reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2;              \
    3743             :             pabyDstWord += nDstPixelStride;                                    \
    3744             :         }                                                                      \
    3745             :         break;                                                                 \
    3746             :     }
    3747             : 
    3748         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
    3749         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
    3750           6 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
    3751         790 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
    3752         790 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
    3753             : 
                               // Not valid destination types.
    3754           0 :         case GDT_Unknown:
    3755             :         case GDT_TypeCount:
    3756           0 :             CPLAssert(false);
    3757             :     }
    3758     1067780 : }
    3759             : 
    3760             : /************************************************************************/
    3761             : /*                          GDALUnrolledCopy()                          */
    3762             : /************************************************************************/
    3763             : 
                       // Copies nIters elements of type T from pSrc to pDest. Consecutive source
                       // elements are srcStride elements of T apart and consecutive destination
                       // elements are dstStride elements of T apart; both strides are compile-time
                       // constants. When __GNUC__ and __AVX2__ are both defined, only the simple
                       // loop is compiled (with an unroll pragma and the tree-vectorize attribute).
                       // Otherwise the bulk of the copy goes through a loop manually unrolled 16
                       // times and the simple loop handles the remaining elements.
    3764             : template <class T, int srcStride, int dstStride>
    3765             : #if defined(__GNUC__) && defined(__AVX2__)
    3766             : __attribute__((optimize("tree-vectorize")))
    3767             : #endif
    3768     3000975 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
    3769             :                                            const T *CPL_RESTRICT pSrc,
    3770             :                                            GPtrDiff_t nIters)
    3771             : {
    3772             : #if !(defined(__GNUC__) && defined(__AVX2__))
    3773     3000975 :     if (nIters >= 16)
    3774             :     {
                               // nIters / 16 passes, each copying 16 elements and then
                               // advancing both pointers.
    3775   132815387 :         for (GPtrDiff_t i = nIters / 16; i != 0; i--)
    3776             :         {
    3777   129935045 :             pDest[0 * dstStride] = pSrc[0 * srcStride];
    3778   129935045 :             pDest[1 * dstStride] = pSrc[1 * srcStride];
    3779   129935045 :             pDest[2 * dstStride] = pSrc[2 * srcStride];
    3780   129935045 :             pDest[3 * dstStride] = pSrc[3 * srcStride];
    3781   129935045 :             pDest[4 * dstStride] = pSrc[4 * srcStride];
    3782   129935045 :             pDest[5 * dstStride] = pSrc[5 * srcStride];
    3783   129935045 :             pDest[6 * dstStride] = pSrc[6 * srcStride];
    3784   129935045 :             pDest[7 * dstStride] = pSrc[7 * srcStride];
    3785   129935045 :             pDest[8 * dstStride] = pSrc[8 * srcStride];
    3786   129935045 :             pDest[9 * dstStride] = pSrc[9 * srcStride];
    3787   129935045 :             pDest[10 * dstStride] = pSrc[10 * srcStride];
    3788   129935045 :             pDest[11 * dstStride] = pSrc[11 * srcStride];
    3789   129935045 :             pDest[12 * dstStride] = pSrc[12 * srcStride];
    3790   129935045 :             pDest[13 * dstStride] = pSrc[13 * srcStride];
    3791   129935045 :             pDest[14 * dstStride] = pSrc[14 * srcStride];
    3792   129935045 :             pDest[15 * dstStride] = pSrc[15 * srcStride];
    3793   129935045 :             pDest += 16 * dstStride;
    3794   129935045 :             pSrc += 16 * srcStride;
    3795             :         }
                               // Fewer than 16 elements are left for the loop below.
    3796     2880417 :         nIters = nIters % 16;
    3797             :     }
    3798             : #else
    3799             : #pragma GCC unroll 4
    3800             : #endif
    3801     5162719 :     for (GPtrDiff_t i = 0; i < nIters; i++)
    3802             :     {
    3803     2161743 :         pDest[i * dstStride] = *pSrc;
    3804     2161743 :         pSrc += srcStride;
    3805             :     }
    3806     3000975 : }
    3807             : 
    3808             : template <class T, int srcStride, int dstStride>
    3809     3000975 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
    3810             :                                     const T *CPL_RESTRICT pSrc,
    3811             :                                     GPtrDiff_t nIters)
    3812             : {
                           // Primary template: forwards to the generic implementation with the
                           // same template arguments. Explicit specializations for GByte with
                           // some stride pairs follow in this file.
    3813     3000975 :     GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
    3814     3000975 : }
    3815             : 
    3816             : #if defined(__AVX2__) && defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                \
    3817             :     (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS))
    3818             : 
    3819             : template <>
    3820             : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3821             :                                    const GByte *CPL_RESTRICT pSrc,
    3822             :                                    GPtrDiff_t nIters)
    3823             : {
                           // Specialization compiled for AVX2 builds that also have SSSE3 at
                           // compile time: keeps one source byte out of three. Runs longer than
                           // 16 elements go to GDALUnrolledCopy_GByte_3_1_SSSE3 (not defined in
                           // this part of the file); shorter ones use the scalar loop.
    3824             :     if (nIters > 16)
    3825             :     {
    3826             :         // The SSSE3 variant is slightly faster than what the gcc autovectorizer
    3827             :         // generates
    3828             :         GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    3829             :     }
    3830             :     else
    3831             :     {
    3832             :         for (GPtrDiff_t i = 0; i < nIters; i++)
    3833             :         {
    3834             :             pDest[i] = *pSrc;
    3835             :             pSrc += 3;
    3836             :         }
    3837             :     }
    3838             : }
    3839             : 
    3840             : #elif defined(HAVE_SSE2) && !(defined(__GNUC__) && defined(__AVX2__))
    3841             : 
    3842             : template <>
    3843      354124 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
    3844             :                                    const GByte *CPL_RESTRICT pSrc,
    3845             :                                    GPtrDiff_t nIters)
    3846             : {
                           // SSE2 specialization: writes nIters bytes to pDest, taking one source
                           // byte out of two (the bytes at even offsets from pSrc).
    3847      354124 :     decltype(nIters) i = 0;
    3848      354124 :     if (nIters > 16)
    3849             :     {
                               // Mask keeping the low byte of each 16-bit lane.
    3850      194667 :         const __m128i xmm_mask = _mm_set1_epi16(0xff);
    3851             :         // If we were sure that there would always be 1 trailing byte, we could
    3852             :         // check against nIters - 15
                               // Each iteration loads 32 source bytes and stores 16 destination
                               // bytes.
    3853     2988110 :         for (; i < nIters - 16; i += 16)
    3854             :         {
    3855             :             __m128i xmm0 =
    3856     2793440 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3857             :             __m128i xmm1 =
    3858     5586890 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3859             :             // Set higher 8bit of each int16 packed word to 0
    3860     2793440 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3861     2793440 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3862             :             // Pack int16 to uint8 and merge back both vector
    3863     2793440 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    3864             : 
    3865             :             // Store result
    3866     2793440 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3867             : 
    3868     2793440 :             pSrc += 2 * 16;
    3869             :         }
    3870             :     }
                           // Scalar copy of whatever the vector loop left (everything when
                           // nIters <= 16).
    3871     4633170 :     for (; i < nIters; i++)
    3872             :     {
    3873     4279050 :         pDest[i] = *pSrc;
    3874     4279050 :         pSrc += 2;
    3875             :     }
    3876      354124 : }
    3877             : 
    3878           1 : static void GDALUnrolledCopy_GByte_3_1_SSE2(GByte *CPL_RESTRICT pDest,
    3879             :                                             const GByte *CPL_RESTRICT pSrc,
    3880             :                                             GPtrDiff_t nIters)
    3881             : {
                           // SSE2-only routine: writes nIters bytes to pDest, taking one source
                           // byte out of three. Each vector iteration loads 48 source bytes into
                           // xmm0, xmm1 and xmm2 and stores 16 destination bytes: output bytes
                           // 0 to 5 come from xmm0, 6 to 10 from xmm1 and 11 to 15 from xmm2.
                           // Every output byte is obtained by byte-shifting the relevant register
                           // so that the wanted source byte lands in its output position, then
                           // AND-ing with a one-byte mask and OR-ing into the accumulator xmm.
    3882           1 :     decltype(nIters) i = 0;
                           // Mask with only the lowest byte set; the other masks are byte shifts
                           // of it.
    3883           1 :     const __m128i xmm_mask_ori = _mm_set_epi32(0, 0, 0, 255);
    3884             :     // If we were sure that there would always be 2 trailing bytes, we could
    3885             :     // check against nIters - 15
    3886           2 :     for (; i < nIters - 16; i += 16)
    3887             :     {
    3888             :         __m128i xmm0 =
    3889           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3890             :         __m128i xmm1 =
    3891           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3892             :         __m128i xmm2 =
    3893           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    3894             : 
                               // Masks selecting output byte 0, 6 and 11 respectively: the first
                               // output position fed by each of the three registers.
    3895           1 :         auto xmm_mask0 = xmm_mask_ori;
    3896           1 :         auto xmm_mask1 = _mm_slli_si128(xmm_mask_ori, 6);
    3897           1 :         auto xmm_mask2 = _mm_slli_si128(xmm_mask_ori, 11);
    3898             : 
                               // Output bytes 0 and 6.
    3899           1 :         auto xmm = _mm_and_si128(xmm0, xmm_mask0);
    3900           1 :         auto xmm_res1 = _mm_and_si128(_mm_slli_si128(xmm1, 4), xmm_mask1);
    3901             : 
                               // Output bytes 1 and 7.
    3902           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3903           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3904           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3905           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3906           2 :         xmm_res1 = _mm_or_si128(
    3907             :             xmm_res1, _mm_and_si128(_mm_slli_si128(xmm1, 2), xmm_mask1));
    3908             : 
                               // Output bytes 2 and 8.
    3909           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3910           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3911           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3912           2 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3913           1 :         xmm_res1 = _mm_or_si128(xmm_res1, _mm_and_si128(xmm1, xmm_mask1));
    3914             : 
                               // Output bytes 3 and 9.
    3915           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3916           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3917           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3918           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3919           2 :         xmm_res1 = _mm_or_si128(
    3920             :             xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 2), xmm_mask1));
    3921             : 
                               // Output bytes 4 and 10, then merge the xmm1 contribution.
    3922           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3923           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3924           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3925           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3926           3 :         xmm_res1 = _mm_or_si128(
    3927             :             xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 4), xmm_mask1));
    3928           1 :         xmm = _mm_or_si128(xmm, xmm_res1);
    3929             : 
                               // Output byte 5, the last one taken from xmm0.
    3930           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3931           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3932           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3933             : 
                               // Output bytes 11 to 15, all taken from xmm2.
    3934           2 :         xmm = _mm_or_si128(xmm,
    3935             :                            _mm_and_si128(_mm_slli_si128(xmm2, 10), xmm_mask2));
    3936             : 
    3937           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3938           2 :         xmm = _mm_or_si128(xmm,
    3939             :                            _mm_and_si128(_mm_slli_si128(xmm2, 8), xmm_mask2));
    3940             : 
    3941           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3942           2 :         xmm = _mm_or_si128(xmm,
    3943             :                            _mm_and_si128(_mm_slli_si128(xmm2, 6), xmm_mask2));
    3944             : 
    3945           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3946           2 :         xmm = _mm_or_si128(xmm,
    3947             :                            _mm_and_si128(_mm_slli_si128(xmm2, 4), xmm_mask2));
    3948             : 
    3949           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    3950           2 :         xmm = _mm_or_si128(xmm,
    3951             :                            _mm_and_si128(_mm_slli_si128(xmm2, 2), xmm_mask2));
    3952             : 
    3953           1 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm);
    3954             : 
    3955           1 :         pSrc += 3 * 16;
    3956             :     }
                           // Scalar copy of the elements the vector loop did not handle.
    3957           2 :     for (; i < nIters; i++)
    3958             :     {
    3959           1 :         pDest[i] = *pSrc;
    3960           1 :         pSrc += 3;
    3961             :     }
    3962           1 : }
    3963             : 
    3964             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    3965             : 
    3966             : template <>
    3967      192265 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3968             :                                    const GByte *CPL_RESTRICT pSrc,
    3969             :                                    GPtrDiff_t nIters)
    3970             : {
                           // Specialization used when SSSE3 is available at compile time: keeps
                           // one source byte out of three. Runs longer than 16 elements go to the
                           // SSSE3 routine when CPLHaveRuntimeSSSE3() returns true and to the SSE2
                           // routine otherwise; shorter runs use the scalar loop.
    3971      192265 :     if (nIters > 16)
    3972             :     {
    3973      186142 :         if (CPLHaveRuntimeSSSE3())
    3974             :         {
    3975      186141 :             GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    3976             :         }
    3977             :         else
    3978             :         {
    3979           1 :             GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
    3980             :         }
    3981             :     }
    3982             :     else
    3983             :     {
    3984       20384 :         for (GPtrDiff_t i = 0; i < nIters; i++)
    3985             :         {
    3986       14261 :             pDest[i] = *pSrc;
    3987       14261 :             pSrc += 3;
    3988             :         }
    3989             :     }
    3990      192265 : }
    3991             : 
    3992             : #else
    3993             : 
    3994             : template <>
                       // Specialization for GByte, source step of 3 elements, packed destination.
                       // This variant is compiled when HAVE_SSSE3_AT_COMPILE_TIME is not defined:
                       // it forwards unconditionally (whatever the value of nIters) to the SSE2
                       // helper.
    3995             : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3996             :                                    const GByte *CPL_RESTRICT pSrc,
    3997             :                                    GPtrDiff_t nIters)
    3998             : {
    3999             :     GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
    4000             : }
    4001             : #endif
    4002             : 
    4003             : template <>
                       // Specialization for GByte, source step of 4 elements, packed destination:
                       // pDest[i] receives the byte found at pSrc[4 * i], for i in [0, nIters).
                       // When more than 16 elements are requested, groups of 16 output bytes are
                       // produced with SSE2 intrinsics; the remaining elements (and all of them
                       // when nIters <= 16) are copied by the scalar loop at the end.
    4004      332657 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
    4005             :                                    const GByte *CPL_RESTRICT pSrc,
    4006             :                                    GPtrDiff_t nIters)
    4007             : {
    4008      332657 :     decltype(nIters) i = 0;
    4009      332657 :     if (nIters > 16)
    4010             :     {
                               // Mask keeping only the lowest byte of each 32-bit lane, i.e. the
                               // first byte of every 4-byte source group.
    4011      327364 :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    4012             :         // If we were sure that there would always be 3 trailing bytes, we could
    4013             :         // check against nIters - 15
                               // Each iteration loads source bytes [0, 64) relative to pSrc while the
                               // last byte actually needed is at offset 60, hence the stricter bound
                               // that leaves the final group to the scalar loop.
    4014    28043500 :         for (; i < nIters - 16; i += 16)
    4015             :         {
    4016             :             __m128i xmm0 =
    4017    27716100 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    4018             :             __m128i xmm1 =
    4019    27716100 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    4020             :             __m128i xmm2 =
    4021    27716100 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    4022             :             __m128i xmm3 =
    4023    55432200 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
    4024             :             // Set higher 24bit of each int32 packed word to 0
    4025    27716100 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    4026    27716100 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    4027    27716100 :             xmm2 = _mm_and_si128(xmm2, xmm_mask);
    4028    27716100 :             xmm3 = _mm_and_si128(xmm3, xmm_mask);
                                   // After masking every lane holds a value in [0, 255], so the
                                   // saturating pack instructions below never clamp.
    4029             :             // Pack int32 to int16
    4030    27716100 :             xmm0 = _mm_packs_epi32(xmm0, xmm1);
    4031    27716100 :             xmm2 = _mm_packs_epi32(xmm2, xmm3);
    4032             :             // Pack int16 to uint8
    4033    27716100 :             xmm0 = _mm_packus_epi16(xmm0, xmm2);
    4034             : 
    4035             :             // Store result
    4036    27716100 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    4037             : 
    4038    27716100 :             pSrc += 4 * 16;
    4039             :         }
    4040             :     }
                           // Scalar tail: one destination byte per iteration, source stepped by 4.
    4041     5048740 :     for (; i < nIters; i++)
    4042             :     {
    4043     4716080 :         pDest[i] = *pSrc;
    4044     4716080 :         pSrc += 4;
    4045             :     }
    4046      332657 : }
    4047             : #endif  // HAVE_SSE2
    4048             : 
    4049             : /************************************************************************/
    4050             : /*                            GDALFastCopy()                            */
    4051             : /************************************************************************/
    4052             : 
                       // Copies nIters elements of type T from pSrc to pDest without any value
                       // conversion.
                       //
                       // nDestStride and nSrcStride are expressed in bytes: they are compared with
                       // multiples of sizeof(T) and divided by sizeof(T) to obtain the per-element
                       // pointer step.
                       //
                       // Dispatch order:
                       //  - a single element: plain assignment;
                       //  - packed destination: memcpy() when the source is packed too, otherwise
                       //    GDALUnrolledCopy<T, 2|3|4, 1> for source steps of 2, 3 or 4 elements;
                       //  - packed source: GDALUnrolledCopy<T, 1, 2|3|4> for destination steps of
                       //    2, 3 or 4 elements;
                       //  - anything else: a generic element-by-element loop.
                       //
                       // NOTE(review): the generic loops use integer division of the strides by
                       // sizeof(T), so strides are assumed to be multiples of sizeof(T) - confirm
                       // against callers.
    4053             : template <class T>
    4054    40098900 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
    4055             :                                 const T *CPL_RESTRICT pSrc, int nSrcStride,
    4056             :                                 GPtrDiff_t nIters)
    4057             : {
    4058    40098900 :     constexpr int sizeofT = static_cast<int>(sizeof(T));
    4059    40098900 :     if (nIters == 1)
    4060             :     {
    4061    22540680 :         *pDest = *pSrc;
    4062             :     }
                           // Destination elements are contiguous.
    4063    17558245 :     else if (nDestStride == sizeofT)
    4064             :     {
    4065    14484102 :         if (nSrcStride == sizeofT)
    4066             :         {
                                   // Both buffers are packed: a single block copy is enough.
    4067    13395287 :             memcpy(pDest, pSrc, nIters * sizeof(T));
    4068             :         }
    4069     1088779 :         else if (nSrcStride == 2 * sizeofT)
    4070             :         {
    4071      357339 :             GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
    4072             :         }
    4073      731440 :         else if (nSrcStride == 3 * sizeofT)
    4074             :         {
    4075      289245 :             GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
    4076             :         }
    4077      442195 :         else if (nSrcStride == 4 * sizeofT)
    4078             :         {
    4079      336639 :             GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
    4080             :         }
    4081             :         else
    4082             :         {
                                   // Arbitrary source step, packed destination.
    4083    17229290 :             while (nIters-- > 0)
    4084             :             {
    4085    17123750 :                 *pDest = *pSrc;
    4086    17123750 :                 pSrc += nSrcStride / sizeofT;
    4087    17123750 :                 pDest++;
    4088             :             }
    4089             :         }
    4090             :     }
                           // Source elements are contiguous, destination is not.
    4091     3074113 :     else if (nSrcStride == sizeofT)
    4092             :     {
    4093     3061117 :         if (nDestStride == 2 * sizeofT)
    4094             :         {
    4095      151252 :             GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
    4096             :         }
    4097     2909865 :         else if (nDestStride == 3 * sizeofT)
    4098             :         {
    4099     2131921 :             GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
    4100             :         }
    4101      777937 :         else if (nDestStride == 4 * sizeofT)
    4102             :         {
    4103      613625 :             GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
    4104             :         }
    4105             :         else
    4106             :         {
                                   // Packed source, arbitrary destination step.
    4107    17169660 :             while (nIters-- > 0)
    4108             :             {
    4109    17005410 :                 *pDest = *pSrc;
    4110    17005410 :                 pSrc++;
    4111    17005410 :                 pDest += nDestStride / sizeofT;
    4112             :             }
    4113             :         }
    4114             :     }
    4115             :     else
    4116             :     {
                               // Neither side is packed: step both pointers by their own stride.
    4117     1220108 :         while (nIters-- > 0)
    4118             :         {
    4119     1207102 :             *pDest = *pSrc;
    4120     1207102 :             pSrc += nSrcStride / sizeofT;
    4121     1207102 :             pDest += nDestStride / sizeofT;
    4122             :         }
    4123             :     }
    4124    40098900 : }
    4125             : 
    4126             : /************************************************************************/
    4127             : /*                          GDALFastCopyByte()                          */
    4128             : /************************************************************************/
    4129             : 
                       // Byte-typed convenience wrapper around GDALFastCopy(). It takes its
                       // arguments in (source, destination) order and passes them on in the
                       // (destination, source) order expected by GDALFastCopy(); nWordCount is
                       // passed as the element count.
    4130      326250 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
    4131             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
    4132             :                              int nDstPixelStride, GPtrDiff_t nWordCount)
    4133             : {
    4134      326250 :     GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
    4135             :                  nWordCount);
    4136      326250 : }
    4137             : 
    4138             : /************************************************************************/
    4139             : /*                           GDALCopyWords()                            */
    4140             : /************************************************************************/
    4141             : 
    4142             : /**
    4143             :  * Copy pixel words from buffer to buffer.
    4144             :  *
                        * This variant takes the word count as an int and forwards all of its
                        * arguments, unchanged, to GDALCopyWords64(), which documents the
                        * parameters and the conversion rules.
                        *
    4145             :  * @see GDALCopyWords64()
    4146             :  */
    4147    80502400 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
    4148             :                                GDALDataType eSrcType, int nSrcPixelStride,
    4149             :                                void *CPL_RESTRICT pDstData,
    4150             :                                GDALDataType eDstType, int nDstPixelStride,
    4151             :                                int nWordCount)
    4152             : {
    4153    80502400 :     GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
    4154             :                     nDstPixelStride, nWordCount);
    4155    80502400 : }
    4156             : 
    4157             : /************************************************************************/
    4158             : /*                          GDALCopyWords64()                           */
    4159             : /************************************************************************/
    4160             : 
    4161             : /**
    4162             :  * Copy pixel words from buffer to buffer.
    4163             :  *
    4164             :  * This function is used to copy pixel word values from one memory buffer
    4165             :  * to another, with support for conversion between data types, and differing
    4166             :  * step factors. The data type conversion is done using the following
    4167             :  * rules:
    4168             :  * <ul>
    4169             :  * <li>Values assigned to a lower range integer type are clipped. For
    4170             :  * instance assigning GDT_Int16 values to a GDT_UInt8 buffer will cause values
    4171             :  * less than 0 to be set to 0, and values larger than 255 to be set to 255.
    4172             :  * </li>
    4173             :  * <li>
    4174             :  * Assignment from floating point to integer rounds to closest integer.
    4175             :  * +Infinity is mapped to the largest integer. -Infinity is mapped to the
    4176             :  * smallest integer. NaN is mapped to 0.
    4177             :  * </li>
    4178             :  * <li>
    4179             :  * Assignment from non-complex to complex will result in the imaginary part
    4180             :  * being set to zero on output.
    4181             :  * </li>
    4182             :  * <li> Assignment from complex to
    4183             :  * non-complex will result in the complex portion being lost and the real
    4184             :  * component being preserved (<i>not magnitude!</i>).
    4185             :  * </li>
    4186             :  * </ul>
    4187             :  *
    4188             :  * No assumptions are made about the source or destination words occurring
    4189             :  * on word boundaries.  It is assumed that all values are in native machine
    4190             :  * byte order.
    4191             :  *
    4192             :  * @param pSrcData Pointer to source data to be converted.
    4193             :  * @param eSrcType the source data type (see GDALDataType enum)
    4194             :  * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
    4195             :  * in bytes
    4196             :  * @param pDstData Pointer to buffer where destination data should go
    4197             :  * @param eDstType the destination data type (see GDALDataType enum)
    4198             :  * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
    4199             :  * words), in bytes
    4200             :  * @param nWordCount number of words to be copied
    4201             :  *
    4202             :  * @note
    4203             :  * When adding a new data type to GDAL, you must do the following to
    4204             :  * support it properly within the GDALCopyWords function:
    4205             :  * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
    4206             :  *    This should invoke the appropriate GDALCopyWordsFromT wrapper.
    4207             :  * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
    4208             :  *    This should call the appropriate GDALCopyWordsT template.
    4209             :  * 3. If appropriate, overload the appropriate CopyWord template in the
    4210             :  *    above namespace. This will ensure that any conversion issues are
    4211             :  *    handled (cases like the float -> int32 case, where the min/max)
    4212             :  *    values are subject to roundoff error.
    4213             :  */
    4214             : 
    4215   116786000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
    4216             :                                  GDALDataType eSrcType, int nSrcPixelStride,
    4217             :                                  void *CPL_RESTRICT pDstData,
    4218             :                                  GDALDataType eDstType, int nDstPixelStride,
    4219             :                                  GPtrDiff_t nWordCount)
    4220             : 
    4221             : {
    4222             :     // On platforms where alignment matters, be careful
    4223   116786000 :     const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
    4224   116786000 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
                           // A zero size is reported as an unsupported data type: emit an error and
                           // leave the destination buffer untouched.
    4225   116786000 :     if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
    4226             :     {
    4227           2 :         CPLError(CE_Failure, CPLE_NotSupported,
    4228             :                  "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
    4229             :                  "argument");
    4230           2 :         return;
    4231             :     }
                           // Slow path for misaligned input: taken when a pointer or a stride is not
                           // a multiple of its data type size. It is skipped when both the types and
                           // the strides are identical; that case is handled further below.
    4232   116786000 :     if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
    4233    66330800 :         ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
    4234    66330800 :          (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
    4235    66330400 :          (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
    4236    66330400 :          (nDstPixelStride % nDstDataTypeSize) != 0))
    4237             :     {
    4238         905 :         if (eSrcType == eDstType)
    4239             :         {
                                   // Same type, no conversion needed: copy each word byte-wise.
    4240       34800 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    4241             :             {
    4242       34000 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    4243             :                        static_cast<const GByte *>(pSrcData) +
    4244       34000 :                            nSrcPixelStride * i,
    4245             :                        nDstDataTypeSize);
    4246             :             }
    4247             :         }
    4248             :         else
    4249             :         {
                                   // Returns the first address at or after ptr that is a multiple of
                                   // align.
    4250         210 :             const auto getAlignedPtr = [](GByte *ptr, int align)
    4251             :             {
    4252             :                 return ptr +
    4253         210 :                        ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
    4254         210 :                         align);
    4255             :             };
    4256             : 
    4257             :             // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
    4258             :             // be sure to get correctly aligned pointer.
    4259         105 :             constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
    4260             :             GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
    4261             :             GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
    4262             :             GByte *pabySrcBuffer =
    4263         105 :                 getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
    4264             :             GByte *pabyDstBuffer =
    4265         105 :                 getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
                                   // Convert one word at a time through the aligned scratch buffers.
                                   // The recursive call uses aligned pointers, zero strides and a
                                   // count of 1, so it neither re-enters this slow path nor takes the
                                   // replication branch below.
    4266        3360 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    4267             :             {
    4268        3255 :                 memcpy(pabySrcBuffer,
    4269             :                        static_cast<const GByte *>(pSrcData) +
    4270        3255 :                            nSrcPixelStride * i,
    4271             :                        nSrcDataTypeSize);
    4272        3255 :                 GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
    4273             :                                 eDstType, 0, 1);
    4274        3255 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    4275             :                        pabyDstBuffer, nDstDataTypeSize);
    4276             :             }
    4277             :         }
    4278         905 :         return;
    4279             :     }
    4280             : 
    4281             :     // Deal with the case where we're replicating a single word into the
    4282             :     // provided buffer
    4283   116785000 :     if (nSrcPixelStride == 0 && nWordCount > 1)
    4284             :     {
    4285     1067780 :         GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
    4286             :                           nDstPixelStride, nWordCount);
    4287     1067780 :         return;
    4288             :     }
    4289             : 
                           // Same source and destination type: no value conversion is required, so
                           // try the raw-copy shortcuts before falling back to the generic switch.
    4290   115717000 :     if (eSrcType == eDstType)
    4291             :     {
                               // One-byte types share the GByte copy routine.
    4292    54678100 :         if (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8)
    4293             :         {
    4294    17976000 :             GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
    4295             :                          static_cast<const GByte *>(pSrcData), nSrcPixelStride,
    4296             :                          nWordCount);
    4297    17976000 :             return;
    4298             :         }
    4299             : 
                               // Any two-byte type is copied as raw 'short' values, provided both
                               // strides are even.
    4300    36702100 :         if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
    4301    21796600 :             (nDstPixelStride % 2) == 0)
    4302             :         {
    4303    21796600 :             GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
    4304             :                          static_cast<const short *>(pSrcData), nSrcPixelStride,
    4305             :                          nWordCount);
    4306    21796600 :             return;
    4307             :         }
    4308             : 
    4309    14905500 :         if (nWordCount == 1)
    4310             :         {
    4311             : #if defined(CSA_BUILD) || defined(__COVERITY__)
    4312             :             // Avoid false positives...
    4313             :             memcpy(pDstData, pSrcData, nSrcDataTypeSize);
    4314             : #else
                                   // NOTE(review): the literal sizes presumably let the compiler
                                   // emit fixed-size copies instead of a generic memcpy call -
                                   // confirm.
    4315    14418400 :             if (nSrcDataTypeSize == 2)
    4316           0 :                 memcpy(pDstData, pSrcData, 2);
    4317    14418400 :             else if (nSrcDataTypeSize == 4)
    4318    13814200 :                 memcpy(pDstData, pSrcData, 4);
    4319      604143 :             else if (nSrcDataTypeSize == 8)
    4320      587538 :                 memcpy(pDstData, pSrcData, 8);
    4321             :             else /* if( eSrcType == GDT_CFloat64 ) */
    4322       16605 :                 memcpy(pDstData, pSrcData, 16);
    4323             : #endif
    4324    14418400 :             return;
    4325             :         }
    4326             : 
    4327             :         // Let memcpy() handle the case where we're copying a packed buffer
    4328             :         // of pixels.
    4329      487079 :         if (nSrcPixelStride == nDstPixelStride)
    4330             :         {
    4331      225235 :             if (nSrcPixelStride == nSrcDataTypeSize)
    4332             :             {
    4333      225167 :                 memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
    4334      225167 :                 return;
    4335             :             }
    4336             :         }
    4337             :     }
    4338             : 
    4339             :     // Handle the more general case -- deals with conversion of data types
    4340             :     // directly.
                           // Each case hands over to GDALCopyWordsFromT<> instantiated on the source
                           // component type. The boolean argument is false for the real-valued types
                           // and true for the complex ones (GDT_CInt16, GDT_CInt32, GDT_CFloat16,
                           // GDT_CFloat32, GDT_CFloat64).
    4341    61300800 :     switch (eSrcType)
    4342             :     {
    4343    20311000 :         case GDT_UInt8:
    4344    20311000 :             GDALCopyWordsFromT<unsigned char>(
    4345             :                 static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
    4346             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4347    20311000 :             break;
    4348        1802 :         case GDT_Int8:
    4349        1802 :             GDALCopyWordsFromT<signed char>(
    4350             :                 static_cast<const signed char *>(pSrcData), nSrcPixelStride,
    4351             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4352        1802 :             break;
    4353       54651 :         case GDT_UInt16:
    4354       54651 :             GDALCopyWordsFromT<unsigned short>(
    4355             :                 static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
    4356             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4357       54651 :             break;
    4358     6519570 :         case GDT_Int16:
    4359     6519570 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    4360             :                                       nSrcPixelStride, false, pDstData,
    4361             :                                       eDstType, nDstPixelStride, nWordCount);
    4362     6519570 :             break;
    4363        8016 :         case GDT_UInt32:
    4364        8016 :             GDALCopyWordsFromT<unsigned int>(
    4365             :                 static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
    4366             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4367        8016 :             break;
    4368    12255600 :         case GDT_Int32:
    4369    12255600 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    4370             :                                     nSrcPixelStride, false, pDstData, eDstType,
    4371             :                                     nDstPixelStride, nWordCount);
    4372    12255600 :             break;
    4373        2205 :         case GDT_UInt64:
    4374        2205 :             GDALCopyWordsFromT<std::uint64_t>(
    4375             :                 static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
    4376             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4377        2205 :             break;
    4378       11729 :         case GDT_Int64:
    4379       11729 :             GDALCopyWordsFromT<std::int64_t>(
    4380             :                 static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
    4381             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4382       11729 :             break;
    4383        1387 :         case GDT_Float16:
    4384        1387 :             GDALCopyWordsFromT<GFloat16>(
    4385             :                 static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
    4386             :                 pDstData, eDstType, nDstPixelStride, nWordCount);
    4387        1387 :             break;
    4388      658514 :         case GDT_Float32:
    4389      658514 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    4390             :                                       nSrcPixelStride, false, pDstData,
    4391             :                                       eDstType, nDstPixelStride, nWordCount);
    4392      658514 :             break;
    4393    20715700 :         case GDT_Float64:
    4394    20715700 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    4395             :                                        nSrcPixelStride, false, pDstData,
    4396             :                                        eDstType, nDstPixelStride, nWordCount);
    4397    20715700 :             break;
    4398      478485 :         case GDT_CInt16:
    4399      478485 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    4400             :                                       nSrcPixelStride, true, pDstData, eDstType,
    4401             :                                       nDstPixelStride, nWordCount);
    4402      478485 :             break;
    4403         868 :         case GDT_CInt32:
    4404         868 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    4405             :                                     nSrcPixelStride, true, pDstData, eDstType,
    4406             :                                     nDstPixelStride, nWordCount);
    4407         868 :             break;
    4408         508 :         case GDT_CFloat16:
    4409         508 :             GDALCopyWordsFromT<GFloat16>(
    4410             :                 static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
    4411             :                 pDstData, eDstType, nDstPixelStride, nWordCount);
    4412         508 :             break;
    4413        2437 :         case GDT_CFloat32:
    4414        2437 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    4415             :                                       nSrcPixelStride, true, pDstData, eDstType,
    4416             :                                       nDstPixelStride, nWordCount);
    4417        2437 :             break;
    4418      278404 :         case GDT_CFloat64:
    4419      278404 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    4420             :                                        nSrcPixelStride, true, pDstData,
    4421             :                                        eDstType, nDstPixelStride, nWordCount);
    4422      278404 :             break;
                               // NOTE(review): presumably unreachable, since the zero-size check at
                               // the top of the function is expected to have rejected these two
                               // values already - confirm.
    4423           0 :         case GDT_Unknown:
    4424             :         case GDT_TypeCount:
    4425           0 :             CPLAssert(false);
    4426             :     }
    4427             : }
    4428             : 
    4429             : /************************************************************************/
    4430             : /*                            GDALCopyBits()                            */
    4431             : /************************************************************************/
    4432             : 
    4433             : /**
    4434             :  * Bitwise word copying.
    4435             :  *
    4436             :  * A function for moving sets of partial bytes around.  Loosely
    4437             :  * speaking this is a bitwise analog to GDALCopyWords().
    4438             :  *
    4439             :  * It copies nStepCount "words" where each word is nBitCount bits long.
    4440             :  * The nSrcStep and nDstStep are the number of bits from the start of one
    4441             :  * word to the next (same as nBitCount if they are packed).  The nSrcOffset
    4442             :  * and nDstOffset are the offset into the source and destination buffers
    4443             :  * to start at, also measured in bits.
    4444             :  *
    4445             :  * All bit offsets are assumed to start from the high order bit in a byte
    4446             :  * (i.e. most significant bit first).  Currently this function is not very
    4447             :  * optimized, but it may be improved for some common cases in the future
    4448             :  * as needed.
    4449             :  *
    4450             :  * @param pabySrcData the source data buffer.
    4451             :  * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
    4452             :  * first word to copy.
    4453             :  * @param nSrcStep the offset in bits from the start one source word to the
    4454             :  * start of the next.
    4455             :  * @param pabyDstData the destination data buffer.
    4456             :  * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
    4457             :  * first word to copy over.
    4458             :  * @param nDstStep the offset in bits from the start one word to the
    4459             :  * start of the next.
    4460             :  * @param nBitCount the number of bits in a word to be copied.
    4461             :  * @param nStepCount the number of words to copy.
    4462             :  */
    4463             : 
    4464           0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
    4465             :                   GByte *pabyDstData, int nDstOffset, int nDstStep,
    4466             :                   int nBitCount, int nStepCount)
    4467             : 
    4468             : {
                           // NOTE(review): only pabySrcData goes through VALIDATE_POINTER0;
                           // pabyDstData is dereferenced below without an equivalent check.
    4469           0 :     VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
    4470             : 
    4471           0 :     for (int iStep = 0; iStep < nStepCount; iStep++)
    4472             :     {
    4473           0 :         for (int iBit = 0; iBit < nBitCount; iBit++)
    4474             :         {
                                   // (offset >> 3) is the byte index and 0x80 >> (offset & 7) the
                                   // mask of the addressed bit, most significant bit first. The
                                   // destination bit is set or cleared to match the source bit.
    4475           0 :             if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
    4476           0 :                 pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
    4477             :             else
    4478           0 :                 pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
    4479             : 
    4480           0 :             nSrcOffset++;
    4481           0 :             nDstOffset++;
    4482             :         }
    4483             : 
                               // The inner loop already advanced both offsets by nBitCount; add the
                               // remainder of each step to land on the start of the next word.
    4484           0 :         nSrcOffset += (nSrcStep - nBitCount);
    4485           0 :         nDstOffset += (nDstStep - nBitCount);
    4486             :     }
    4487             : }
    4488             : 
    4489             : /************************************************************************/
    4490             : /*                    GDALBandGetBestOverviewLevel()                    */
    4491             : /*                                                                      */
    4492             : /* Returns the best overview level to satisfy the query or -1 if none   */
    4493             : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when        */
    4494             : /* returning a valid overview level                                     */
    4495             : /************************************************************************/
    4496             : 
    4497           0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
    4498             :                                  int &nXSize, int &nYSize, int nBufXSize,
    4499             :                                  int nBufYSize)
    4500             : {
                           // Forwards to GDALBandGetBestOverviewLevel2() with a null psExtraArg and
                           // returns its result unchanged.
    4501           0 :     return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
    4502           0 :                                          nBufXSize, nBufYSize, nullptr);
    4503             : }
    4504             : 
    4505      524002 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
    4506             :                                   int &nYOff, int &nXSize, int &nYSize,
    4507             :                                   int nBufXSize, int nBufYSize,
    4508             :                                   GDALRasterIOExtraArg *psExtraArg)
    4509             : {
    4510      524002 :     if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
    4511      524002 :         psExtraArg->bUseOnlyThisScale)
    4512         109 :         return -1;
    4513             :     /* -------------------------------------------------------------------- */
    4514             :     /*      Compute the desired downsampling factor.  It is                 */
    4515             :     /*      based on the least reduced axis, and represents the number      */
    4516             :     /*      of source pixels to one destination pixel.                      */
    4517             :     /* -------------------------------------------------------------------- */
    4518      523893 :     const double dfDesiredDownsamplingFactor =
    4519      523893 :         ((nXSize / static_cast<double>(nBufXSize)) <
    4520      361553 :              (nYSize / static_cast<double>(nBufYSize)) ||
    4521             :          nBufYSize == 1)
    4522      752282 :             ? nXSize / static_cast<double>(nBufXSize)
    4523      133164 :             : nYSize / static_cast<double>(nBufYSize);
    4524             : 
    4525             :     /* -------------------------------------------------------------------- */
    4526             :     /*      Find the overview level that largest downsampling factor (most  */
    4527             :     /*      downsampled) that is still less than (or only a little more)    */
    4528             :     /*      downsampled than the request.                                   */
    4529             :     /* -------------------------------------------------------------------- */
    4530      523893 :     const int nOverviewCount = poBand->GetOverviewCount();
    4531      523893 :     GDALRasterBand *poBestOverview = nullptr;
    4532      523893 :     double dfBestDownsamplingFactor = 0;
    4533      523893 :     int nBestOverviewLevel = -1;
    4534             : 
    4535             :     const char *pszOversampligThreshold =
    4536      523893 :         CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
    4537             : 
    4538             :     // Note: keep this logic for overview selection in sync between
    4539             :     // gdalwarp_lib.cpp and rasterio.cpp
    4540             :     // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
    4541             :     const double dfOversamplingThreshold =
    4542     1047780 :         pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
    4543      523884 :         : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
    4544     1047770 :             ? 1.0
    4545      523893 :             : 1.2;
    4546      526589 :     for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    4547             :     {
    4548        5614 :         GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
    4549       11228 :         if (poOverview == nullptr ||
    4550       11227 :             poOverview->GetXSize() > poBand->GetXSize() ||
    4551        5613 :             poOverview->GetYSize() > poBand->GetYSize())
    4552             :         {
    4553           1 :             continue;
    4554             :         }
    4555             : 
    4556             :         // Compute downsampling factor of this overview
    4557             :         const double dfDownsamplingFactor = std::min(
    4558        5613 :             poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
    4559       11226 :             poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
    4560             : 
    4561             :         // Is it nearly the requested factor and better (lower) than
    4562             :         // the current best factor?
    4563             :         // Use an epsilon because of numerical instability.
    4564        5613 :         constexpr double EPSILON = 1e-1;
    4565        5721 :         if (dfDownsamplingFactor >=
    4566        5613 :                 dfDesiredDownsamplingFactor * dfOversamplingThreshold +
    4567        5505 :                     EPSILON ||
    4568             :             dfDownsamplingFactor <= dfBestDownsamplingFactor)
    4569             :         {
    4570         108 :             continue;
    4571             :         }
    4572             : 
    4573             :         // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
    4574        5505 :         const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
    4575             : 
    4576        5505 :         if (pszResampling != nullptr &&
    4577          71 :             STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
    4578          16 :             continue;
    4579             : 
    4580             :         // OK, this is our new best overview.
    4581        5489 :         poBestOverview = poOverview;
    4582        5489 :         nBestOverviewLevel = iOverview;
    4583        5489 :         dfBestDownsamplingFactor = dfDownsamplingFactor;
    4584             : 
    4585        5489 :         if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
    4586             :             EPSILON)
    4587             :         {
    4588        2918 :             break;
    4589             :         }
    4590             :     }
    4591             : 
    4592             :     /* -------------------------------------------------------------------- */
    4593             :     /*      If we didn't find an overview that helps us, just return        */
    4594             :     /*      indicating failure and the full resolution image will be used.  */
    4595             :     /* -------------------------------------------------------------------- */
    4596      523893 :     if (nBestOverviewLevel < 0)
    4597      520902 :         return -1;
    4598             : 
    4599             :     /* -------------------------------------------------------------------- */
    4600             :     /*      Recompute the source window in terms of the selected            */
    4601             :     /*      overview.                                                       */
    4602             :     /* -------------------------------------------------------------------- */
    4603             :     const double dfXFactor =
    4604        2991 :         poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
    4605             :     const double dfYFactor =
    4606        2991 :         poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
    4607        2991 :     CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
    4608             :              poBestOverview->GetYSize());
    4609             : 
    4610        8973 :     const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
    4611        2991 :                                 static_cast<int>(nXOff / dfXFactor + 0.5));
    4612        8973 :     const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
    4613        2991 :                                 static_cast<int>(nYOff / dfYFactor + 0.5));
    4614        2991 :     int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
    4615        2991 :     int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
    4616        2991 :     if (nOXOff + nOXSize > poBestOverview->GetXSize())
    4617           0 :         nOXSize = poBestOverview->GetXSize() - nOXOff;
    4618        2991 :     if (nOYOff + nOYSize > poBestOverview->GetYSize())
    4619           2 :         nOYSize = poBestOverview->GetYSize() - nOYOff;
    4620             : 
    4621        2991 :     if (psExtraArg)
    4622             :     {
    4623        2991 :         if (psExtraArg->bFloatingPointWindowValidity)
    4624             :         {
    4625         117 :             psExtraArg->dfXOff /= dfXFactor;
    4626         117 :             psExtraArg->dfXSize /= dfXFactor;
    4627         117 :             psExtraArg->dfYOff /= dfYFactor;
    4628         117 :             psExtraArg->dfYSize /= dfYFactor;
    4629             :         }
    4630        2874 :         else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
    4631             :         {
    4632          16 :             psExtraArg->bFloatingPointWindowValidity = true;
    4633          16 :             psExtraArg->dfXOff = nXOff / dfXFactor;
    4634          16 :             psExtraArg->dfXSize = nXSize / dfXFactor;
    4635          16 :             psExtraArg->dfYOff = nYOff / dfYFactor;
    4636          16 :             psExtraArg->dfYSize = nYSize / dfYFactor;
    4637             :         }
    4638             :     }
    4639             : 
    4640        2991 :     nXOff = nOXOff;
    4641        2991 :     nYOff = nOYOff;
    4642        2991 :     nXSize = nOXSize;
    4643        2991 :     nYSize = nOYSize;
    4644             : 
    4645        2991 :     return nBestOverviewLevel;
    4646             : }
    4647             : 
    4648             : /************************************************************************/
    4649             : /*                          OverviewRasterIO()                          */
    4650             : /*                                                                      */
    4651             : /*      Special work function to utilize available overviews to         */
    4652             : /*      more efficiently satisfy downsampled requests.  It will         */
    4653             : /*      return CE_Failure if there are no appropriate overviews         */
    4654             : /*      available but it doesn't emit any error messages.               */
    4655             : /************************************************************************/
    4656             : 
    4657             : //! @cond Doxygen_Suppress
    4658           2 : CPLErr GDALRasterBand::OverviewRasterIO(
    4659             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4660             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4661             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    4662             : 
    4663             : {
    4664             :     GDALRasterIOExtraArg sExtraArg;
    4665           2 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4666             : 
    4667           2 :     const int nOverview = GDALBandGetBestOverviewLevel2(
    4668             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
    4669           2 :     if (nOverview < 0)
    4670           1 :         return CE_Failure;
    4671             : 
    4672             :     /* -------------------------------------------------------------------- */
    4673             :     /*      Recast the call in terms of the new raster layer.               */
    4674             :     /* -------------------------------------------------------------------- */
    4675           1 :     GDALRasterBand *poOverviewBand = GetOverview(nOverview);
    4676           1 :     if (poOverviewBand == nullptr)
    4677           0 :         return CE_Failure;
    4678             : 
    4679           1 :     return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4680             :                                     pData, nBufXSize, nBufYSize, eBufType,
    4681           1 :                                     nPixelSpace, nLineSpace, &sExtraArg);
    4682             : }
    4683             : 
    4684             : /************************************************************************/
    4685             : /*                        TryOverviewRasterIO()                         */
    4686             : /************************************************************************/
    4687             : 
    4688      362420 : CPLErr GDALRasterBand::TryOverviewRasterIO(
    4689             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4690             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4691             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
    4692             :     int *pbTried)
    4693             : {
    4694      362420 :     int nXOffMod = nXOff;
    4695      362420 :     int nYOffMod = nYOff;
    4696      362420 :     int nXSizeMod = nXSize;
    4697      362420 :     int nYSizeMod = nYSize;
    4698             :     GDALRasterIOExtraArg sExtraArg;
    4699             : 
    4700      362420 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4701             : 
    4702      362420 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    4703             :         this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
    4704             :         &sExtraArg);
    4705             : 
    4706      362420 :     if (iOvrLevel >= 0)
    4707             :     {
    4708          52 :         GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
    4709          52 :         if (poOverviewBand)
    4710             :         {
    4711          52 :             *pbTried = TRUE;
    4712          52 :             return poOverviewBand->RasterIO(
    4713             :                 eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
    4714             :                 nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    4715          52 :                 &sExtraArg);
    4716             :         }
    4717             :     }
    4718             : 
    4719      362368 :     *pbTried = FALSE;
    4720      362368 :     return CE_None;
    4721             : }
    4722             : 
    4723             : /************************************************************************/
    4724             : /*                        TryOverviewRasterIO()                         */
    4725             : /************************************************************************/
    4726             : 
    4727      158606 : CPLErr GDALDataset::TryOverviewRasterIO(
    4728             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4729             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4730             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    4731             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
    4732             :     int *pbTried)
    4733             : {
    4734      158606 :     int nXOffMod = nXOff;
    4735      158606 :     int nYOffMod = nYOff;
    4736      158606 :     int nXSizeMod = nXSize;
    4737      158606 :     int nYSizeMod = nYSize;
    4738             :     GDALRasterIOExtraArg sExtraArg;
    4739      158606 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4740             : 
    4741      317212 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    4742      158606 :         papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
    4743             :         nBufYSize, &sExtraArg);
    4744             : 
    4745      158647 :     if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
    4746          41 :         papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
    4747             :     {
    4748          41 :         *pbTried = TRUE;
    4749          41 :         return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
    4750             :             eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
    4751             :             nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
    4752          41 :             nLineSpace, nBandSpace, &sExtraArg);
    4753             :     }
    4754             :     else
    4755             :     {
    4756      158565 :         *pbTried = FALSE;
    4757      158565 :         return CE_None;
    4758             :     }
    4759             : }
    4760             : 
    4761             : /************************************************************************/
    4762             : /*                        GetBestOverviewLevel()                        */
    4763             : /*                                                                      */
    4764             : /* Returns the best overview level to satisfy the query or -1 if none   */
    4765             : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid     */
    4766             : /* overview level                                                       */
    4767             : /************************************************************************/
    4768             : 
    4769           4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
    4770             :                                            int &nYOff, int &nXSize, int &nYSize,
    4771             :                                            int nBufXSize, int nBufYSize,
    4772             :                                            int nBandCount,
    4773             :                                            const int *panBandMap,
    4774             :                                            GDALRasterIOExtraArg *psExtraArg)
    4775             : {
    4776           4 :     int nOverviewCount = 0;
    4777           4 :     GDALRasterBand *poFirstBand = nullptr;
    4778             : 
    4779             :     /* -------------------------------------------------------------------- */
    4780             :     /* Check that all bands have the same number of overviews and           */
    4781             :     /* that they have all the same size and block dimensions                */
    4782             :     /* -------------------------------------------------------------------- */
    4783          12 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4784             :     {
    4785           8 :         GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
    4786           8 :         if (poBand == nullptr)
    4787           0 :             return -1;
    4788           8 :         if (iBand == 0)
    4789             :         {
    4790           4 :             poFirstBand = poBand;
    4791           4 :             nOverviewCount = poBand->GetOverviewCount();
    4792             :         }
    4793           4 :         else if (nOverviewCount != poBand->GetOverviewCount())
    4794             :         {
    4795           0 :             CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    4796             :                              "mismatched overview count, use std method.");
    4797           0 :             return -1;
    4798             :         }
    4799             :         else
    4800             :         {
    4801           4 :             for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    4802             :             {
    4803           0 :                 GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
    4804             :                 GDALRasterBand *poOvrFirstBand =
    4805           0 :                     poFirstBand->GetOverview(iOverview);
    4806           0 :                 if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
    4807           0 :                     continue;
    4808             : 
    4809           0 :                 if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
    4810           0 :                     poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
    4811             :                 {
    4812           0 :                     CPLDebug("GDAL",
    4813             :                              "GDALDataset::GetBestOverviewLevel() ... "
    4814             :                              "mismatched overview sizes, use std method.");
    4815           0 :                     return -1;
    4816             :                 }
    4817           0 :                 int nBlockXSizeFirst = 0;
    4818           0 :                 int nBlockYSizeFirst = 0;
    4819           0 :                 poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
    4820             :                                              &nBlockYSizeFirst);
    4821             : 
    4822           0 :                 int nBlockXSizeCurrent = 0;
    4823           0 :                 int nBlockYSizeCurrent = 0;
    4824           0 :                 poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
    4825             :                                         &nBlockYSizeCurrent);
    4826             : 
    4827           0 :                 if (nBlockXSizeFirst != nBlockXSizeCurrent ||
    4828           0 :                     nBlockYSizeFirst != nBlockYSizeCurrent)
    4829             :                 {
    4830           0 :                     CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    4831             :                                      "mismatched block sizes, use std method.");
    4832           0 :                     return -1;
    4833             :                 }
    4834             :             }
    4835             :         }
    4836             :     }
    4837           4 :     if (poFirstBand == nullptr)
    4838           0 :         return -1;
    4839             : 
    4840           4 :     return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
    4841             :                                          nYSize, nBufXSize, nBufYSize,
    4842           4 :                                          psExtraArg);
    4843             : }
    4844             : 
    4845             : /************************************************************************/
    4846             : /*                         BlockBasedRasterIO()                         */
    4847             : /*                                                                      */
    4848             : /*      This convenience function implements a dataset level            */
    4849             : /*      RasterIO() interface based on calling down to fetch blocks,     */
    4850             : /*      much like the GDALRasterBand::IRasterIO(), but it handles       */
    4851             : /*      all bands at once, so that a format driver that handles a       */
    4852             : /*      request for different bands of the same block efficiently       */
    4853             : /*      (i.e. without re-reading interleaved data) will efficiently.    */
    4854             : /*                                                                      */
    4855             : /*      This method is intended to be called by an overridden           */
    4856             : /*      IRasterIO() method in the driver specific GDALDataset           */
    4857             : /*      derived class.                                                  */
    4858             : /*                                                                      */
    4859             : /*      Default internal implementation of RasterIO() ... utilizes      */
    4860             : /*      the Block access methods to satisfy the request.  This would    */
    4861             : /*      normally only be overridden by formats with overviews.          */
    4862             : /*                                                                      */
    4863             : /*      To keep things relatively simple, this method does not          */
    4864             : /*      currently take advantage of some special cases addressed in     */
    4865             : /*      GDALRasterBand::IRasterIO(), so it is likely best to only       */
    4866             : /*      call it when you know it will help.  That is in cases where     */
    4867             : /*      data is at 1:1 to the buffer, and you know the driver is        */
    4868             : /*      implementing interleaved IO efficiently on a block by block     */
    4869             : /*      basis. Overviews will be used when possible.                    */
    4870             : /************************************************************************/
    4871             : 
    4872       64982 : CPLErr GDALDataset::BlockBasedRasterIO(
    4873             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4874             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4875             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    4876             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    4877             : 
    4878             : {
    4879       64982 :     CPLAssert(nullptr != pData);
    4880             : 
    4881       64982 :     GByte **papabySrcBlock = nullptr;
    4882       64982 :     GDALRasterBlock *poBlock = nullptr;
    4883       64982 :     GDALRasterBlock **papoBlocks = nullptr;
    4884       64982 :     int nLBlockX = -1;
    4885       64982 :     int nLBlockY = -1;
    4886             :     int iBufYOff;
    4887             :     int iBufXOff;
    4888       64982 :     int nBlockXSize = 1;
    4889       64982 :     int nBlockYSize = 1;
    4890       64982 :     CPLErr eErr = CE_None;
    4891       64982 :     GDALDataType eDataType = GDT_UInt8;
    4892             : 
    4893       64982 :     const bool bUseIntegerRequestCoords =
    4894       65020 :         (!psExtraArg->bFloatingPointWindowValidity ||
    4895          38 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
    4896          36 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
    4897             : 
    4898             :     /* -------------------------------------------------------------------- */
    4899             :     /*      Ensure that all bands share a common block size and data type.  */
    4900             :     /* -------------------------------------------------------------------- */
    4901      308187 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4902             :     {
    4903      243205 :         GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4904             : 
    4905      243205 :         if (iBand == 0)
    4906             :         {
    4907       64982 :             poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4908       64982 :             eDataType = poBand->GetRasterDataType();
    4909             :         }
    4910             :         else
    4911             :         {
    4912      178223 :             int nThisBlockXSize = 0;
    4913      178223 :             int nThisBlockYSize = 0;
    4914      178223 :             poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
    4915      178223 :             if (nThisBlockXSize != nBlockXSize ||
    4916      178223 :                 nThisBlockYSize != nBlockYSize)
    4917             :             {
    4918           0 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4919             :                                  "mismatched block sizes, use std method.");
    4920           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4921             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4922             :                                          nBandCount, panBandMap, nPixelSpace,
    4923           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4924             :             }
    4925             : 
    4926      178223 :             if (eDataType != poBand->GetRasterDataType() &&
    4927           0 :                 (nXSize != nBufXSize || nYSize != nBufYSize))
    4928             :             {
    4929           0 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4930             :                                  "mismatched band data types, use std method.");
    4931           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4932             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4933             :                                          nBandCount, panBandMap, nPixelSpace,
    4934           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4935             :             }
    4936             :         }
    4937             :     }
    4938             : 
    4939             :     /* ==================================================================== */
    4940             :     /*      In this special case at full resolution we step through in      */
    4941             :     /*      blocks, turning the request over to the per-band                */
    4942             :     /*      IRasterIO(), but ensuring that all bands of one block are       */
    4943             :     /*      called before proceeding to the next.                           */
    4944             :     /* ==================================================================== */
    4945             : 
    4946       64982 :     if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
    4947             :     {
    4948             :         GDALRasterIOExtraArg sDummyExtraArg;
    4949       64978 :         INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
    4950             : 
    4951       64978 :         int nChunkYSize = 0;
    4952       64978 :         int nChunkXSize = 0;
    4953             : 
    4954      213434 :         for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
    4955             :         {
    4956      149472 :             const int nChunkYOff = iBufYOff + nYOff;
    4957      149472 :             nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
    4958      149472 :             if (nChunkYOff + nChunkYSize > nYOff + nYSize)
    4959       59977 :                 nChunkYSize = (nYOff + nYSize) - nChunkYOff;
    4960             : 
    4961      822752 :             for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
    4962             :             {
    4963      674295 :                 const int nChunkXOff = iBufXOff + nXOff;
    4964      674295 :                 nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
    4965      674295 :                 if (nChunkXOff + nChunkXSize > nXOff + nXSize)
    4966       70691 :                     nChunkXSize = (nXOff + nXSize) - nChunkXOff;
    4967             : 
    4968      674295 :                 GByte *pabyChunkData =
    4969      674295 :                     static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
    4970      674295 :                     static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
    4971             : 
    4972     3282490 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4973             :                 {
    4974     2609210 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4975             : 
    4976     5218420 :                     eErr = poBand->IRasterIO(
    4977             :                         eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
    4978             :                         nChunkYSize,
    4979     2609210 :                         pabyChunkData +
    4980     2609210 :                             static_cast<GPtrDiff_t>(iBand) * nBandSpace,
    4981             :                         nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
    4982     2609210 :                         nLineSpace, &sDummyExtraArg);
    4983     2609210 :                     if (eErr != CE_None)
    4984        1015 :                         return eErr;
    4985             :                 }
    4986             :             }
    4987             : 
    4988      167371 :             if (psExtraArg->pfnProgress != nullptr &&
    4989       18914 :                 !psExtraArg->pfnProgress(
    4990      167371 :                     1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
    4991             :                         nBufYSize,
    4992             :                     "", psExtraArg->pProgressData))
    4993             :             {
    4994           1 :                 return CE_Failure;
    4995             :             }
    4996             :         }
    4997             : 
    4998       63962 :         return CE_None;
    4999             :     }
    5000             : 
    5001             :     /* The code below is not compatible with that case. It would need a */
    5002             :     /* completely separate code path, as done in GDALRasterBand::IRasterIO. */
    5003           4 :     if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
    5004             :     {
    5005           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    5006             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    5007             :                                  panBandMap, nPixelSpace, nLineSpace,
    5008           0 :                                  nBandSpace, psExtraArg);
    5009             :     }
    5010             : 
    5011             :     /* We could have a smarter implementation, but that will do for now */
    5012           4 :     if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    5013           0 :         (nBufXSize != nXSize || nBufYSize != nYSize))
    5014             :     {
    5015           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    5016             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    5017             :                                  panBandMap, nPixelSpace, nLineSpace,
    5018           0 :                                  nBandSpace, psExtraArg);
    5019             :     }
    5020             : 
    5021             :     /* ==================================================================== */
    5022             :     /*      Loop reading required source blocks to satisfy output           */
    5023             :     /*      request.  This is the most general implementation.              */
    5024             :     /* ==================================================================== */
    5025             : 
    5026           4 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
    5027             : 
    5028             :     papabySrcBlock =
    5029           4 :         static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
    5030             :     papoBlocks =
    5031           4 :         static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
    5032             : 
    5033             :     /* -------------------------------------------------------------------- */
    5034             :     /*      Select an overview level if appropriate.                        */
    5035             :     /* -------------------------------------------------------------------- */
    5036             : 
    5037             :     GDALRasterIOExtraArg sExtraArg;
    5038           4 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    5039           4 :     const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
    5040             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
    5041             :         panBandMap, &sExtraArg);
    5042           4 :     if (nOverviewLevel >= 0)
    5043             :     {
    5044           2 :         GetRasterBand(panBandMap[0])
    5045           2 :             ->GetOverview(nOverviewLevel)
    5046           2 :             ->GetBlockSize(&nBlockXSize, &nBlockYSize);
    5047             :     }
    5048             : 
    5049           4 :     double dfXOff = nXOff;
    5050           4 :     double dfYOff = nYOff;
    5051           4 :     double dfXSize = nXSize;
    5052           4 :     double dfYSize = nYSize;
    5053           4 :     if (sExtraArg.bFloatingPointWindowValidity)
    5054             :     {
    5055           2 :         dfXOff = sExtraArg.dfXOff;
    5056           2 :         dfYOff = sExtraArg.dfYOff;
    5057           2 :         dfXSize = sExtraArg.dfXSize;
    5058           2 :         dfYSize = sExtraArg.dfYSize;
    5059             :     }
    5060             : 
    5061             :     /* -------------------------------------------------------------------- */
    5062             :     /*      Compute stepping increment.                                     */
    5063             :     /* -------------------------------------------------------------------- */
    5064           4 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
    5065           4 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
    5066             : 
    5067           4 :     constexpr double EPS = 1e-10;
    5068             :     /* -------------------------------------------------------------------- */
    5069             :     /*      Loop over buffer computing source locations.                    */
    5070             :     /* -------------------------------------------------------------------- */
    5071          36 :     for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
    5072             :     {
    5073             :         GPtrDiff_t iSrcOffset;
    5074             : 
    5075             :         // Add small epsilon to avoid some numeric precision issues.
    5076          32 :         const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
    5077          32 :         const int iSrcY = static_cast<int>(std::min(
    5078          32 :             std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
    5079             : 
    5080          32 :         GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
    5081             :                                 static_cast<GPtrDiff_t>(nLineSpace);
    5082             : 
    5083         302 :         for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
    5084             :         {
    5085         270 :             const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
    5086         270 :             const int iSrcX = static_cast<int>(std::min(
    5087         270 :                 std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
    5088             : 
    5089             :             // FIXME: this code likely doesn't work if the dirty block gets
    5090             :             // flushed to disk before being completely written. In the meantime,
    5091             :             // bJustInitialize should probably be set to FALSE even if it is not
    5092             :             // ideal performance wise, and for lossy compression
    5093             : 
    5094             :             /* --------------------------------------------------------------------
    5095             :              */
    5096             :             /*      Ensure we have the appropriate block loaded. */
    5097             :             /* --------------------------------------------------------------------
    5098             :              */
    5099         270 :             if (iSrcX < nLBlockX * nBlockXSize ||
    5100         270 :                 iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
    5101         266 :                 iSrcY < nLBlockY * nBlockYSize ||
    5102         266 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
    5103             :             {
    5104           4 :                 nLBlockX = iSrcX / nBlockXSize;
    5105           4 :                 nLBlockY = iSrcY / nBlockYSize;
    5106             : 
    5107           4 :                 const bool bJustInitialize =
    5108           0 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
    5109           0 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
    5110           4 :                     nXOff <= nLBlockX * nBlockXSize &&
    5111           0 :                     nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
    5112             :                 /*bool bMemZeroBuffer = FALSE;
    5113             :                 if( eRWFlag == GF_Write && !bJustInitialize &&
    5114             :                     nXOff <= nLBlockX * nBlockXSize &&
    5115             :                     nYOff <= nLBlockY * nBlockYSize &&
    5116             :                     (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
    5117             :                      (nXOff + nXSize == GetRasterXSize() &&
    5118             :                      (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
    5119             :                     (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
    5120             :                      (nYOff + nYSize == GetRasterYSize() &&
    5121             :                      (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
    5122             :                 {
    5123             :                     bJustInitialize = TRUE;
    5124             :                     bMemZeroBuffer = TRUE;
    5125             :                 }*/
    5126          12 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    5127             :                 {
    5128           8 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    5129           8 :                     if (nOverviewLevel >= 0)
    5130           2 :                         poBand = poBand->GetOverview(nOverviewLevel);
    5131          16 :                     poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
    5132           8 :                                                         bJustInitialize);
    5133           8 :                     if (poBlock == nullptr)
    5134             :                     {
    5135           0 :                         eErr = CE_Failure;
    5136           0 :                         goto CleanupAndReturn;
    5137             :                     }
    5138             : 
    5139           8 :                     if (eRWFlag == GF_Write)
    5140           0 :                         poBlock->MarkDirty();
    5141             : 
    5142           8 :                     if (papoBlocks[iBand] != nullptr)
    5143           0 :                         papoBlocks[iBand]->DropLock();
    5144             : 
    5145           8 :                     papoBlocks[iBand] = poBlock;
    5146             : 
    5147           8 :                     papabySrcBlock[iBand] =
    5148           8 :                         static_cast<GByte *>(poBlock->GetDataRef());
    5149             :                     /*if( bMemZeroBuffer )
    5150             :                     {
    5151             :                         memset(papabySrcBlock[iBand], 0,
    5152             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
    5153             :                     * nBlockYSize);
    5154             :                     }*/
    5155             :                 }
    5156             :             }
    5157             : 
    5158             :             /* --------------------------------------------------------------------
    5159             :              */
    5160             :             /*      Copy over this pixel of data. */
    5161             :             /* --------------------------------------------------------------------
    5162             :              */
    5163         270 :             iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
    5164         270 :                           static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
    5165         270 :                           (static_cast<GPtrDiff_t>(iSrcY) -
    5166         270 :                            static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
    5167         270 :                               nBlockXSize) *
    5168         270 :                          nBandDataSize;
    5169             : 
    5170         980 :             for (int iBand = 0; iBand < nBandCount; iBand++)
    5171             :             {
    5172         710 :                 GByte *pabySrcBlock = papabySrcBlock[iBand];
    5173         710 :                 GPtrDiff_t iBandBufOffset =
    5174         710 :                     iBufOffset + static_cast<GPtrDiff_t>(iBand) *
    5175             :                                      static_cast<GPtrDiff_t>(nBandSpace);
    5176             : 
    5177         710 :                 if (eDataType == eBufType)
    5178             :                 {
    5179         710 :                     if (eRWFlag == GF_Read)
    5180         710 :                         memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
    5181         710 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
    5182             :                     else
    5183           0 :                         memcpy(pabySrcBlock + iSrcOffset,
    5184             :                                static_cast<const GByte *>(pData) +
    5185           0 :                                    iBandBufOffset,
    5186             :                                nBandDataSize);
    5187             :                 }
    5188             :                 else
    5189             :                 {
    5190             :                     /* type to type conversion ... ouch, this is expensive way
    5191             :                        of handling single words */
    5192             : 
    5193           0 :                     if (eRWFlag == GF_Read)
    5194           0 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
    5195             :                                         static_cast<GByte *>(pData) +
    5196           0 :                                             iBandBufOffset,
    5197             :                                         eBufType, 0, 1);
    5198             :                     else
    5199           0 :                         GDALCopyWords64(static_cast<const GByte *>(pData) +
    5200           0 :                                             iBandBufOffset,
    5201           0 :                                         eBufType, 0, pabySrcBlock + iSrcOffset,
    5202             :                                         eDataType, 0, 1);
    5203             :                 }
    5204             :             }
    5205             : 
    5206         270 :             iBufOffset += static_cast<int>(nPixelSpace);
    5207             :         }
    5208             :     }
    5209             : 
    5210             :     /* -------------------------------------------------------------------- */
    5211             :     /*      CleanupAndReturn.                                               */
    5212             :     /* -------------------------------------------------------------------- */
    5213           4 : CleanupAndReturn:
    5214           4 :     CPLFree(papabySrcBlock);
    5215           4 :     if (papoBlocks != nullptr)
    5216             :     {
    5217          12 :         for (int iBand = 0; iBand < nBandCount; iBand++)
    5218             :         {
    5219           8 :             if (papoBlocks[iBand] != nullptr)
    5220           8 :                 papoBlocks[iBand]->DropLock();
    5221             :         }
    5222           4 :         CPLFree(papoBlocks);
    5223             :     }
    5224             : 
    5225           4 :     return eErr;
    5226             : }
    5227             : 
    5228             : //! @endcond
    5229             : 
    5230             : /************************************************************************/
    5231             : /*                  GDALCopyWholeRasterGetSwathSize()                   */
    5232             : /************************************************************************/
    5233             : 
    5234        3359 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
    5235             :                                             GDALRasterBand *poDstPrototypeBand,
    5236             :                                             int nBandCount,
    5237             :                                             int bDstIsCompressed,
    5238             :                                             int bInterleave, int *pnSwathCols,
    5239             :                                             int *pnSwathLines)
    5240             : {
    5241        3359 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    5242        3359 :     int nSrcBlockXSize = 0;
    5243        3359 :     int nSrcBlockYSize = 0;
    5244        3359 :     int nBlockXSize = 0;
    5245        3359 :     int nBlockYSize = 0;
    5246             : 
    5247        3359 :     int nXSize = poSrcPrototypeBand->GetXSize();
    5248        3359 :     int nYSize = poSrcPrototypeBand->GetYSize();
    5249             : 
    5250        3359 :     poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
    5251        3359 :     poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    5252             : 
    5253        3359 :     const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
    5254        3359 :     const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
    5255             : 
    5256        3359 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5257        3359 :     if (bInterleave)
    5258         583 :         nPixelSize *= nBandCount;
    5259             : 
    5260             :     // Initial guess: one full-width row of the tallest blocks. Do not settle for less.
    5261        3359 :     int nSwathCols = nXSize;
    5262        3359 :     int nSwathLines = nMaxBlockYSize;
    5263             : 
    5264             :     const char *pszSrcCompression =
    5265        3359 :         poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    5266        3359 :     if (pszSrcCompression == nullptr)
    5267             :     {
    5268        3339 :         auto poSrcDS = poSrcPrototypeBand->GetDataset();
    5269        3339 :         if (poSrcDS)
    5270             :             pszSrcCompression =
    5271        3333 :                 poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    5272             :     }
    5273             : 
    5274             :     /* -------------------------------------------------------------------- */
    5275             :     /*      What will our swath size be?                                    */
    5276             :     /* -------------------------------------------------------------------- */
    5277             :     // When writing interleaved data in a compressed format, we want to be sure
    5278             :     // that each block will only be written once, so the swath size must not be
    5279             :     // greater than the block cache.
    5280        3359 :     const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
    5281             :     int nTargetSwathSize;
    5282        3359 :     if (pszSwathSize != nullptr)
    5283           0 :         nTargetSwathSize = static_cast<int>(
    5284           0 :             std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
    5285             :     else
    5286             :     {
    5287             :         // As a default, take 1/4 of the cache size.
    5288        3359 :         nTargetSwathSize = static_cast<int>(
    5289        3359 :             std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
    5290             : 
    5291             :         // But if the minimum ideal swath buffer size is less, then go for it to
    5292             :         // avoid unnecessarily abusing RAM usage,
    5293             :         // but try to use 10 MB at least.
    5294        3359 :         GIntBig nIdealSwathBufSize =
    5295        3359 :             static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
    5296        3359 :         int nMinTargetSwathSize = 10 * 1000 * 1000;
    5297             : 
    5298        3359 :         if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
    5299        3359 :              GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
    5300             :         {
    5301           1 :             nMinTargetSwathSize = nTargetSwathSize;
    5302             :         }
    5303             : 
    5304        3359 :         if (nIdealSwathBufSize < nTargetSwathSize &&
    5305        3349 :             nIdealSwathBufSize < nMinTargetSwathSize)
    5306             :         {
    5307        3346 :             nIdealSwathBufSize = nMinTargetSwathSize;
    5308             :         }
    5309             : 
    5310        3359 :         if (pszSrcCompression != nullptr &&
    5311         178 :             EQUAL(pszSrcCompression, "JPEG2000") &&
    5312           0 :             (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
    5313           0 :                                    (nSrcBlockYSize % nBlockYSize) == 0)))
    5314             :         {
    5315           2 :             nIdealSwathBufSize =
    5316           4 :                 std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
    5317           2 :                                                  nSrcBlockYSize * nPixelSize);
    5318             :         }
    5319        3359 :         if (nTargetSwathSize > nIdealSwathBufSize)
    5320        3346 :             nTargetSwathSize = static_cast<int>(
    5321        3346 :                 std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
    5322             :     }
    5323             : 
    5324        3359 :     if (nTargetSwathSize < 1000000)
    5325           8 :         nTargetSwathSize = 1000000;
    5326             : 
    5327             :     /* But let's check that the target swath size does not exceed the block cache size. */
    5328        3580 :     if (bDstIsCompressed && bInterleave &&
    5329         221 :         nTargetSwathSize > GDALGetCacheMax64())
    5330             :     {
    5331           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    5332             :                  "When translating into a compressed interleave format, "
    5333             :                  "the block cache size (" CPL_FRMT_GIB ") "
    5334             :                  "should be at least the size of the swath (%d) "
    5335             :                  "(GDAL_SWATH_SIZE config. option)",
    5336             :                  GDALGetCacheMax64(), nTargetSwathSize);
    5337             :     }
    5338             : 
    5339             : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
    5340             : #define ROUND_TO(x, y) (((x) / (y)) * (y))
    5341             : 
    5342             :     // If both input and output datasets are tiled, and the tile dimensions
    5343             :     // are "compatible", try to stick to a swath dimension that is a multiple
    5344             :     // of input and output block dimensions.
    5345        3359 :     if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
    5346          47 :         IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
    5347          47 :         IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
    5348          47 :         IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    5349          47 :         IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    5350             :     {
    5351          47 :         if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
    5352          47 :                 nPixelSize <=
    5353          47 :             static_cast<GIntBig>(nTargetSwathSize))
    5354             :         {
    5355          47 :             nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
    5356          47 :             nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
    5357          47 :             if (nSwathCols == 0)
    5358           0 :                 nSwathCols = nMaxBlockXSize;
    5359          47 :             if (nSwathCols > nXSize)
    5360          45 :                 nSwathCols = nXSize;
    5361          47 :             nSwathLines = nMaxBlockYSize;
    5362             : 
    5363          47 :             if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
    5364          47 :                 static_cast<GIntBig>(nTargetSwathSize))
    5365             :             {
    5366           0 :                 nSwathCols = nXSize;
    5367           0 :                 nSwathLines = nBlockYSize;
    5368             :             }
    5369             :         }
    5370             :     }
    5371             : 
    5372        3359 :     const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
    5373        3359 :     const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
    5374        3359 :     if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
    5375             :     {
    5376           1 :         nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
    5377           1 :         if (nSwathLines == 0)
    5378           1 :             nSwathLines = 1;
    5379             : 
    5380           1 :         CPLDebug(
    5381             :             "GDAL",
    5382             :             "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
    5383             :             "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
    5384             :             "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
    5385           1 :             nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
    5386             :     }
    5387             :     // If we are processing single scans, try to handle several at once.
    5388             :     // If we are handling swaths already, only grow the swath if a row
    5389             :     // of blocks is substantially less than our target buffer size.
    5390        3358 :     else if (nSwathLines == 1 ||
    5391        2807 :              nMemoryPerCol * nSwathLines <
    5392        2807 :                  static_cast<GIntBig>(nTargetSwathSize) / 10)
    5393             :     {
    5394        3330 :         nSwathLines = std::min(
    5395             :             nYSize,
    5396        3330 :             std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
    5397             : 
    5398             :         /* If possible try to align to source and target block height */
    5399        3330 :         if ((nSwathLines % nMaxBlockYSize) != 0 &&
    5400         273 :             nSwathLines > nMaxBlockYSize &&
    5401         273 :             IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    5402         244 :             IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    5403         217 :             nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
    5404             :     }
    5405             : 
    5406        3359 :     if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
    5407           0 :         (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
    5408           0 :                                IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
    5409             :     {
    5410             :         // Typical use case: converting from Pleiades that is 2048x2048 tiled.
    5411           2 :         if (nSwathLines < nSrcBlockYSize)
    5412             :         {
    5413           0 :             nSwathLines = nSrcBlockYSize;
    5414             : 
    5415             :             // Number of pixels that can be read/written simultaneously.
    5416           0 :             nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
    5417           0 :             nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
    5418           0 :             if (nSwathCols == 0)
    5419           0 :                 nSwathCols = nSrcBlockXSize;
    5420           0 :             if (nSwathCols > nXSize)
    5421           0 :                 nSwathCols = nXSize;
    5422             : 
    5423           0 :             CPLDebug(
    5424             :                 "GDAL",
    5425             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    5426             :                 "too high block, "
    5427             :                 "use partial width at one time");
    5428             :         }
    5429           2 :         else if ((nSwathLines % nSrcBlockYSize) != 0)
    5430             :         {
    5431             :             /* Round down to a multiple of nSrcBlockYSize */
    5432           0 :             nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
    5433           0 :             CPLDebug(
    5434             :                 "GDAL",
    5435             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    5436             :                 "round nSwathLines to block height : %d",
    5437             :                 nSwathLines);
    5438             :         }
    5439             :     }
    5440        3357 :     else if (bDstIsCompressed)
    5441             :     {
    5442         419 :         if (nSwathLines < nBlockYSize)
    5443             :         {
    5444         146 :             nSwathLines = nBlockYSize;
    5445             : 
    5446             :             // Number of pixels that can be read/written simultaneously.
    5447         146 :             nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
    5448         146 :             nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
    5449         146 :             if (nSwathCols == 0)
    5450           0 :                 nSwathCols = nBlockXSize;
    5451         146 :             if (nSwathCols > nXSize)
    5452         146 :                 nSwathCols = nXSize;
    5453             : 
    5454         146 :             CPLDebug(
    5455             :                 "GDAL",
    5456             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    5457             :                 "too high block, "
    5458             :                 "use partial width at one time");
    5459             :         }
    5460         273 :         else if ((nSwathLines % nBlockYSize) != 0)
    5461             :         {
    5462             :             // Round down to a multiple of nBlockYSize.
    5463           9 :             nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
    5464           9 :             CPLDebug(
    5465             :                 "GDAL",
    5466             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    5467             :                 "round nSwathLines to block height : %d",
    5468             :                 nSwathLines);
    5469             :         }
    5470             :     }
    5471             : 
    5472        3359 :     *pnSwathCols = nSwathCols;
    5473        3359 :     *pnSwathLines = nSwathLines;
    5474        3359 : }
    5475             : 
    5476             : /************************************************************************/
    5477             : /*                     GDALDatasetCopyWholeRaster()                     */
    5478             : /************************************************************************/
    5479             : 
    5480             : /**
    5481             :  * \brief Copy all dataset raster data.
    5482             :  *
    5483             :  * This function copies the complete raster contents of one dataset to
    5484             :  * another similarly configured dataset.  The source and destination
    5485             :  * dataset must have the same number of bands, and the same width
    5486             :  * and height.  The bands do not have to have the same data type.
    5487             :  *
    5488             :  * This function is primarily intended to support implementation of
    5489             :  * driver specific CreateCopy() functions.  It implements efficient copying,
    5490             :  * in particular "chunking" the copy in substantial blocks and, if appropriate,
    5491             :  * performing the transfer in a pixel interleaved fashion.
    5492             :  *
    5493             :  * Currently the only papszOptions values supported are:
    5494             :  * <ul>
    5495             :  * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
    5496             :  * write access pattern (this does not modify the layout of the destination
    5497             :  * data)</li>
    5498             :  * <li>"COMPRESSED=YES" to force alignment on target dataset block
    5499             :  * sizes to achieve best compression.</li>
    5500             :  * <li>"SKIP_HOLES=YES" to skip chunks
    5501             :  * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
    5502             :  * (GDAL &gt;= 2.2)</li>
    5503             :  * </ul>
    5504             :  * More options may be supported in the future.
    5505             :  *
    5506             :  * @param hSrcDS the source dataset
    5507             :  * @param hDstDS the destination dataset
    5508             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    5509             :  * @param pfnProgress progress reporting function.
    5510             :  * @param pProgressData callback data for progress function.
    5511             :  *
    5512             :  * @return CE_None on success, or CE_Failure on failure.
    5513             :  */
    5514             : 
    5515        3331 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
    5516             :                                               GDALDatasetH hDstDS,
    5517             :                                               CSLConstList papszOptions,
    5518             :                                               GDALProgressFunc pfnProgress,
    5519             :                                               void *pProgressData)
    5520             : 
    5521             : {
    5522        3331 :     VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    5523        3331 :     VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    5524             : 
    5525        3331 :     GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
    5526        3331 :     GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
    5527             : 
    5528        3331 :     if (pfnProgress == nullptr)
    5529           0 :         pfnProgress = GDALDummyProgress;
    5530             : 
    5531             :     /* -------------------------------------------------------------------- */
    5532             :     /*      Confirm the datasets match in size and band counts.             */
    5533             :     /* -------------------------------------------------------------------- */
    5534        3331 :     const int nXSize = poDstDS->GetRasterXSize();
    5535        3331 :     const int nYSize = poDstDS->GetRasterYSize();
    5536        3331 :     const int nBandCount = poDstDS->GetRasterCount();
    5537             : 
    5538        3331 :     if (poSrcDS->GetRasterXSize() != nXSize ||
    5539        6662 :         poSrcDS->GetRasterYSize() != nYSize ||
    5540        3331 :         poSrcDS->GetRasterCount() != nBandCount)
    5541             :     {
    5542           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    5543             :                  "Input and output dataset sizes or band counts do not\n"
    5544             :                  "match in GDALDatasetCopyWholeRaster()");
    5545           0 :         return CE_Failure;
    5546             :     }
    5547             : 
    5548             :     /* -------------------------------------------------------------------- */
    5549             :     /*      Report preliminary (0) progress.                                */
    5550             :     /* -------------------------------------------------------------------- */
    5551        3331 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    5552             :     {
    5553           1 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    5554             :                  "User terminated CreateCopy()");
    5555           1 :         return CE_Failure;
    5556             :     }
    5557             : 
    5558             :     /* -------------------------------------------------------------------- */
    5559             :     /*      Get our prototype band, and assume the others are similarly     */
    5560             :     /*      configured.                                                     */
    5561             :     /* -------------------------------------------------------------------- */
    5562        3330 :     if (nBandCount == 0)
    5563           0 :         return CE_None;
    5564             : 
    5565        3330 :     GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
    5566        3330 :     GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
    5567        3330 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    5568             : 
    5569             :     /* -------------------------------------------------------------------- */
    5570             :     /*      Do we want to try and do the operation in a pixel               */
    5571             :     /*      interleaved fashion?                                            */
    5572             :     /* -------------------------------------------------------------------- */
    5573        3330 :     bool bInterleave = false;
    5574             :     const char *pszInterleave =
    5575        3330 :         poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    5576        3330 :     if (pszInterleave != nullptr &&
    5577        2926 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    5578         209 :         bInterleave = true;
    5579             : 
    5580        3330 :     pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    5581        3330 :     if (pszInterleave != nullptr &&
    5582        2865 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    5583         528 :         bInterleave = true;
    5584             : 
    5585        3330 :     pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
    5586        3330 :     if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
    5587           5 :         bInterleave = true;
    5588        3325 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
    5589          13 :         bInterleave = false;
    5590             :     // INTERLEAVE=ATTRIBUTES is specific to the TileDB driver.
    5591        3312 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
    5592           4 :         bInterleave = true;
    5593        3308 :     else if (pszInterleave != nullptr)
    5594             :     {
    5595           0 :         CPLError(CE_Warning, CPLE_NotSupported,
    5596             :                  "Unsupported value for option INTERLEAVE");
    5597             :     }
    5598             : 
    5599             :     // If the destination is compressed, we must try to write blocks just once,
    5600             :     // to save disk space (GTiff case for example), and to avoid data loss
    5601             :     // (JPEG compression for example).
    5602        3330 :     bool bDstIsCompressed = false;
    5603             :     const char *pszDstCompressed =
    5604        3330 :         CSLFetchNameValue(papszOptions, "COMPRESSED");
    5605        3330 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5606         393 :         bDstIsCompressed = true;
    5607             : 
    5608             :     /* -------------------------------------------------------------------- */
    5609             :     /*      What will our swath size be?                                    */
    5610             :     /* -------------------------------------------------------------------- */
    5611             : 
    5612        3330 :     int nSwathCols = 0;
    5613        3330 :     int nSwathLines = 0;
    5614        3330 :     GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
    5615             :                                     nBandCount, bDstIsCompressed, bInterleave,
    5616             :                                     &nSwathCols, &nSwathLines);
    5617             : 
    5618        3330 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5619        3330 :     if (bInterleave)
    5620         583 :         nPixelSize *= nBandCount;
    5621             : 
    5622        3330 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    5623        3330 :     if (pSwathBuf == nullptr)
    5624             :     {
    5625           0 :         return CE_Failure;
    5626             :     }
    5627             : 
    5628        3330 :     CPLDebug("GDAL",
    5629             :              "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
    5630             :              nSwathCols, nSwathLines, static_cast<int>(bInterleave));
    5631             : 
    5632             :     // Advise the source raster that we are going to read it completely
    5633             :     // Note: this might already have been done by GDALCreateCopy() in the
    5634             :     // likely case this function is indirectly called by it
    5635        3330 :     poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
    5636        3330 :                         nullptr, nullptr);
    5637             : 
    5638             :     /* ==================================================================== */
    5639             :     /*      Band oriented (uninterleaved) case.                             */
    5640             :     /* ==================================================================== */
    5641        3330 :     CPLErr eErr = CE_None;
    5642             :     const bool bCheckHoles =
    5643        3330 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    5644             : 
    5645        3330 :     if (!bInterleave)
    5646             :     {
    5647             :         GDALRasterIOExtraArg sExtraArg;
    5648        2747 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5649        2747 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    5650             : 
    5651        8241 :         const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
    5652        2747 :                                      DIV_ROUND_UP(nYSize, nSwathLines) *
    5653        2747 :                                      DIV_ROUND_UP(nXSize, nSwathCols);
    5654        2747 :         GIntBig nBlocksDone = 0;
    5655             : 
    5656        7934 :         for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
    5657             :         {
    5658        5187 :             int nBand = iBand + 1;
    5659             : 
    5660       10637 :             for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5661             :             {
    5662        5450 :                 int nThisLines = nSwathLines;
    5663             : 
    5664        5450 :                 if (iY + nThisLines > nYSize)
    5665         368 :                     nThisLines = nYSize - iY;
    5666             : 
    5667       10900 :                 for (int iX = 0; iX < nXSize && eErr == CE_None;
    5668        5450 :                      iX += nSwathCols)
    5669             :                 {
    5670        5450 :                     int nThisCols = nSwathCols;
    5671             : 
    5672        5450 :                     if (iX + nThisCols > nXSize)
    5673           0 :                         nThisCols = nXSize - iX;
    5674             : 
    5675        5450 :                     int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5676        5450 :                     if (bCheckHoles)
    5677             :                     {
    5678             :                         nStatus = poSrcDS->GetRasterBand(nBand)
    5679        3744 :                                       ->GetDataCoverageStatus(
    5680             :                                           iX, iY, nThisCols, nThisLines,
    5681             :                                           GDAL_DATA_COVERAGE_STATUS_DATA);
    5682             :                     }
    5683        5450 :                     if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5684             :                     {
    5685        5446 :                         sExtraArg.pfnProgress = GDALScaledProgress;
    5686       10892 :                         sExtraArg.pProgressData = GDALCreateScaledProgress(
    5687        5446 :                             nBlocksDone / static_cast<double>(nTotalBlocks),
    5688        5446 :                             (nBlocksDone + 0.5) /
    5689        5446 :                                 static_cast<double>(nTotalBlocks),
    5690             :                             pfnProgress, pProgressData);
    5691        5446 :                         if (sExtraArg.pProgressData == nullptr)
    5692        1676 :                             sExtraArg.pfnProgress = nullptr;
    5693             : 
    5694        5446 :                         eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    5695             :                                                  nThisLines, pSwathBuf,
    5696             :                                                  nThisCols, nThisLines, eDT, 1,
    5697             :                                                  &nBand, 0, 0, 0, &sExtraArg);
    5698             : 
    5699        5446 :                         GDALDestroyScaledProgress(sExtraArg.pProgressData);
    5700             : 
    5701        5446 :                         if (eErr == CE_None)
    5702        5439 :                             eErr = poDstDS->RasterIO(
    5703             :                                 GF_Write, iX, iY, nThisCols, nThisLines,
    5704             :                                 pSwathBuf, nThisCols, nThisLines, eDT, 1,
    5705             :                                 &nBand, 0, 0, 0, nullptr);
    5706             :                     }
    5707             : 
    5708        5450 :                     nBlocksDone++;
    5709       10858 :                     if (eErr == CE_None &&
    5710        5408 :                         !pfnProgress(nBlocksDone /
    5711        5408 :                                          static_cast<double>(nTotalBlocks),
    5712             :                                      nullptr, pProgressData))
    5713             :                     {
    5714           2 :                         eErr = CE_Failure;
    5715           2 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    5716             :                                  "User terminated CreateCopy()");
    5717             :                     }
    5718             :                 }
    5719             :             }
    5720             :         }
    5721             :     }
    5722             : 
    5723             :     /* ==================================================================== */
    5724             :     /*      Pixel interleaved case.                                         */
    5725             :     /* ==================================================================== */
    5726             :     else /* if( bInterleave ) */
    5727             :     {
    5728             :         GDALRasterIOExtraArg sExtraArg;
    5729         583 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5730         583 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    5731             : 
    5732         583 :         const GIntBig nTotalBlocks =
    5733         583 :             static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
    5734         583 :             DIV_ROUND_UP(nXSize, nSwathCols);
    5735         583 :         GIntBig nBlocksDone = 0;
    5736             : 
    5737        1388 :         for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5738             :         {
    5739         805 :             int nThisLines = nSwathLines;
    5740             : 
    5741         805 :             if (iY + nThisLines > nYSize)
    5742         198 :                 nThisLines = nYSize - iY;
    5743             : 
    5744        1615 :             for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    5745             :             {
    5746         810 :                 int nThisCols = nSwathCols;
    5747             : 
    5748         810 :                 if (iX + nThisCols > nXSize)
    5749           3 :                     nThisCols = nXSize - iX;
    5750             : 
    5751         810 :                 int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5752         810 :                 if (bCheckHoles)
    5753             :                 {
    5754         551 :                     nStatus = 0;
    5755         604 :                     for (int iBand = 0; iBand < nBandCount; iBand++)
    5756             :                     {
    5757         585 :                         nStatus |= poSrcDS->GetRasterBand(iBand + 1)
    5758         585 :                                        ->GetDataCoverageStatus(
    5759             :                                            iX, iY, nThisCols, nThisLines,
    5760             :                                            GDAL_DATA_COVERAGE_STATUS_DATA);
    5761         585 :                         if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5762         532 :                             break;
    5763             :                     }
    5764             :                 }
    5765         810 :                 if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5766             :                 {
    5767         791 :                     sExtraArg.pfnProgress = GDALScaledProgress;
    5768        1582 :                     sExtraArg.pProgressData = GDALCreateScaledProgress(
    5769         791 :                         nBlocksDone / static_cast<double>(nTotalBlocks),
    5770         791 :                         (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
    5771             :                         pfnProgress, pProgressData);
    5772         791 :                     if (sExtraArg.pProgressData == nullptr)
    5773         375 :                         sExtraArg.pfnProgress = nullptr;
    5774             : 
    5775         791 :                     eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    5776             :                                              nThisLines, pSwathBuf, nThisCols,
    5777             :                                              nThisLines, eDT, nBandCount,
    5778             :                                              nullptr, 0, 0, 0, &sExtraArg);
    5779             : 
    5780         791 :                     GDALDestroyScaledProgress(sExtraArg.pProgressData);
    5781             : 
    5782         791 :                     if (eErr == CE_None)
    5783         790 :                         eErr = poDstDS->RasterIO(
    5784             :                             GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
    5785             :                             nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
    5786             :                             0, 0, nullptr);
    5787             :                 }
    5788             : 
    5789         810 :                 nBlocksDone++;
    5790        1615 :                 if (eErr == CE_None &&
    5791         805 :                     !pfnProgress(nBlocksDone /
    5792         805 :                                      static_cast<double>(nTotalBlocks),
    5793             :                                  nullptr, pProgressData))
    5794             :                 {
    5795           1 :                     eErr = CE_Failure;
    5796           1 :                     CPLError(CE_Failure, CPLE_UserInterrupt,
    5797             :                              "User terminated CreateCopy()");
    5798             :                 }
    5799             :             }
    5800             :         }
    5801             :     }
    5802             : 
    5803             :     /* -------------------------------------------------------------------- */
    5804             :     /*      Cleanup                                                         */
    5805             :     /* -------------------------------------------------------------------- */
    5806        3330 :     CPLFree(pSwathBuf);
    5807             : 
    5808        3330 :     return eErr;
    5809             : }
    5810             : 
    5811             : /************************************************************************/
    5812             : /*                   GDALRasterBandCopyWholeRaster()                    */
    5813             : /************************************************************************/
    5814             : 
    5815             : /**
    5816             :  * \brief Copy a whole raster band
    5817             :  *
    5818             :  * This function copies the complete raster contents of one band to
    5819             :  * another similarly configured band.  The source and destination
    5820             :  * bands must have the same width and height.  The bands do not have
    5821             :  * to have the same data type.
    5822             :  *
    5823             :  * It implements efficient copying, in particular "chunking" the copy in
    5824             :  * substantial blocks.
    5825             :  *
    5826             :  * Currently the only papszOptions values supported are:
    5827             :  * <ul>
    5828             :  * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
    5829             :  * achieve best compression.</li>
    5830             :  * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
    5831             :  * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL &gt;= 2.2)</li>
    5832             :  * </ul>
    5833             :  *
    5834             :  * @param hSrcBand the source band
    5835             :  * @param hDstBand the destination band
    5836             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    5837             :  * @param pfnProgress progress reporting function.
    5838             :  * @param pProgressData callback data for progress function.
    5839             :  *
    5840             :  * @return CE_None on success, or CE_Failure on failure.
    5841             :  */
    5842             : 
    5843          29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
    5844             :     GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
    5845             :     const char *const *const papszOptions, GDALProgressFunc pfnProgress,
    5846             :     void *pProgressData)
    5847             : 
    5848             : {
    5849          29 :     VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    5850          29 :     VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    5851             : 
    5852          29 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    5853          29 :     GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
    5854          29 :     CPLErr eErr = CE_None;
    5855             : 
    5856          29 :     if (pfnProgress == nullptr)
    5857           2 :         pfnProgress = GDALDummyProgress;
    5858             : 
    5859             :     /* -------------------------------------------------------------------- */
    5860             :     /*      Confirm the bands match in size.                                */
    5861             :     /* -------------------------------------------------------------------- */
    5862          29 :     int nXSize = poSrcBand->GetXSize();
    5863          29 :     int nYSize = poSrcBand->GetYSize();
    5864             : 
    5865          29 :     if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
    5866             :     {
    5867           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    5868             :                  "Input and output band sizes do not\n"
    5869             :                  "match in GDALRasterBandCopyWholeRaster()");
    5870           0 :         return CE_Failure;
    5871             :     }
    5872             : 
    5873             :     /* -------------------------------------------------------------------- */
    5874             :     /*      Report preliminary (0) progress.                                */
    5875             :     /* -------------------------------------------------------------------- */
    5876          29 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    5877             :     {
    5878           0 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    5879             :                  "User terminated CreateCopy()");
    5880           0 :         return CE_Failure;
    5881             :     }
    5882             : 
    5883          29 :     GDALDataType eDT = poDstBand->GetRasterDataType();
    5884             : 
    5885             :     // If the destination is compressed, we must try to write blocks just once,
    5886             :     // to save disk space (GTiff case for example), and to avoid data loss
    5887             :     // (JPEG compression for example).
    5888          29 :     bool bDstIsCompressed = false;
    5889             :     const char *pszDstCompressed =
    5890          29 :         CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
    5891          29 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5892          26 :         bDstIsCompressed = true;
    5893             : 
    5894             :     /* -------------------------------------------------------------------- */
    5895             :     /*      What will our swath size be?                                    */
    5896             :     /* -------------------------------------------------------------------- */
    5897             : 
    5898          29 :     int nSwathCols = 0;
    5899          29 :     int nSwathLines = 0;
    5900          29 :     GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
    5901             :                                     FALSE, &nSwathCols, &nSwathLines);
    5902             : 
    5903          29 :     const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5904             : 
    5905          29 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    5906          29 :     if (pSwathBuf == nullptr)
    5907             :     {
    5908           0 :         return CE_Failure;
    5909             :     }
    5910             : 
    5911          29 :     CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
    5912             :              nSwathCols, nSwathLines);
    5913             : 
    5914             :     const bool bCheckHoles =
    5915          29 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    5916             : 
    5917             :     // Advise the source raster that we are going to read it completely
    5918          29 :     poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
    5919             : 
    5920             :     /* ==================================================================== */
    5921             :     /*      Band oriented (uninterleaved) case.                             */
    5922             :     /* ==================================================================== */
    5923             : 
    5924          72 :     for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5925             :     {
    5926          43 :         int nThisLines = nSwathLines;
    5927             : 
    5928          43 :         if (iY + nThisLines > nYSize)
    5929           8 :             nThisLines = nYSize - iY;
    5930             : 
    5931          86 :         for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    5932             :         {
    5933          43 :             int nThisCols = nSwathCols;
    5934             : 
    5935          43 :             if (iX + nThisCols > nXSize)
    5936           0 :                 nThisCols = nXSize - iX;
    5937             : 
    5938          43 :             int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5939          43 :             if (bCheckHoles)
    5940             :             {
    5941           0 :                 nStatus = poSrcBand->GetDataCoverageStatus(
    5942             :                     iX, iY, nThisCols, nThisLines,
    5943             :                     GDAL_DATA_COVERAGE_STATUS_DATA);
    5944             :             }
    5945          43 :             if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5946             :             {
    5947          43 :                 eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
    5948             :                                            nThisLines, pSwathBuf, nThisCols,
    5949             :                                            nThisLines, eDT, 0, 0, nullptr);
    5950             : 
    5951          43 :                 if (eErr == CE_None)
    5952          43 :                     eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
    5953             :                                                nThisLines, pSwathBuf, nThisCols,
    5954             :                                                nThisLines, eDT, 0, 0, nullptr);
    5955             :             }
    5956             : 
    5957          86 :             if (eErr == CE_None && !pfnProgress(double(iY + nThisLines) /
    5958          43 :                                                     static_cast<double>(nYSize),
    5959             :                                                 nullptr, pProgressData))
    5960             :             {
    5961           0 :                 eErr = CE_Failure;
    5962           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt,
    5963             :                          "User terminated CreateCopy()");
    5964             :             }
    5965             :         }
    5966             :     }
    5967             : 
    5968             :     /* -------------------------------------------------------------------- */
    5969             :     /*      Cleanup                                                         */
    5970             :     /* -------------------------------------------------------------------- */
    5971          29 :     CPLFree(pSwathBuf);
    5972             : 
    5973          29 :     return eErr;
    5974             : }
    5975             : 
    5976             : /************************************************************************/
    5977             : /*                     GDALCopyRasterIOExtraArg ()                      */
    5978             : /************************************************************************/
    5979             : 
    5980      527312 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
    5981             :                               GDALRasterIOExtraArg *psSrcArg)
    5982             : {
    5983      527312 :     INIT_RASTERIO_EXTRA_ARG(*psDestArg);
    5984      527312 :     if (psSrcArg)
    5985             :     {
    5986      527312 :         psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
    5987      527312 :         psDestArg->pfnProgress = psSrcArg->pfnProgress;
    5988      527312 :         psDestArg->pProgressData = psSrcArg->pProgressData;
    5989      527312 :         psDestArg->bFloatingPointWindowValidity =
    5990      527312 :             psSrcArg->bFloatingPointWindowValidity;
    5991      527312 :         if (psSrcArg->bFloatingPointWindowValidity)
    5992             :         {
    5993      204393 :             psDestArg->dfXOff = psSrcArg->dfXOff;
    5994      204393 :             psDestArg->dfYOff = psSrcArg->dfYOff;
    5995      204393 :             psDestArg->dfXSize = psSrcArg->dfXSize;
    5996      204393 :             psDestArg->dfYSize = psSrcArg->dfYSize;
    5997             :         }
    5998      527312 :         if (psSrcArg->nVersion >= 2)
    5999             :         {
    6000      527312 :             psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
    6001             :         }
    6002             :     }
    6003      527312 : }
    6004             : 
    6005             : /************************************************************************/
    6006             : /*                           HasOnlyNoData()                            */
    6007             : /************************************************************************/
    6008             : 
    6009    50997376 : template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
    6010             : {
    6011    50997376 :     return value == noDataValue;
    6012             : }
    6013             : 
    6014        5509 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
    6015             : {
    6016             :     using std::isnan;
    6017        5509 :     return isnan(noDataValue) ? isnan(value) : value == noDataValue;
    6018             : }
    6019             : 
    6020      251221 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
    6021             : {
    6022      251221 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    6023             : }
    6024             : 
    6025      264257 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
    6026             : {
    6027      264257 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    6028             : }
    6029             : 
    6030             : template <class T>
    6031       12015 : static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
    6032             :                            size_t nHeight, size_t nLineStride,
    6033             :                            size_t nComponents)
    6034             : {
    6035             :     // Fast test: check the 4 corners and the middle pixel.
    6036       23278 :     for (size_t iBand = 0; iBand < nComponents; iBand++)
    6037             :     {
    6038       24077 :         if (!(IsEqualToNoData(pBuffer[iBand], noDataValue) &&
    6039       11871 :               IsEqualToNoData(pBuffer[(nWidth - 1) * nComponents + iBand],
    6040       11741 :                               noDataValue) &&
    6041       11741 :               IsEqualToNoData(
    6042       11741 :                   pBuffer[((nHeight - 1) / 2 * nLineStride + (nWidth - 1) / 2) *
    6043       11741 :                               nComponents +
    6044             :                           iBand],
    6045       11266 :                   noDataValue) &&
    6046       11266 :               IsEqualToNoData(
    6047       11266 :                   pBuffer[(nHeight - 1) * nLineStride * nComponents + iBand],
    6048             :                   noDataValue) &&
    6049       11266 :               IsEqualToNoData(
    6050       11266 :                   pBuffer[((nHeight - 1) * nLineStride + nWidth - 1) *
    6051       11266 :                               nComponents +
    6052             :                           iBand],
    6053             :                   noDataValue)))
    6054             :         {
    6055         943 :             return false;
    6056             :         }
    6057             :     }
    6058             : 
    6059             :     // Test all pixels.
    6060       51319 :     for (size_t iY = 0; iY < nHeight; iY++)
    6061             :     {
    6062       40368 :         const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
    6063    51500248 :         for (size_t iX = 0; iX < nWidth * nComponents; iX++)
    6064             :         {
    6065    51459915 :             if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
    6066             :             {
    6067         121 :                 return false;
    6068             :             }
    6069             :         }
    6070             :     }
    6071       10951 :     return true;
    6072             : }
    6073             : 
    6074             : /************************************************************************/
    6075             : /*                      GDALBufferHasOnlyNoData()                       */
    6076             : /************************************************************************/
    6077             : 
    6078       43882 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
    6079             :                              size_t nWidth, size_t nHeight, size_t nLineStride,
    6080             :                              size_t nComponents, int nBitsPerSample,
    6081             :                              GDALBufferSampleFormat nSampleFormat)
    6082             : {
    6083             :     // In the case where the nodata is 0, we can compare several bytes at
    6084             :     // once. Select the largest natural integer type for the architecture.
    6085       43882 :     if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    6086             :         // Do not use this optimized code path for floating point numbers,
    6087             :         // as it can't detect negative zero.
    6088             :         nSampleFormat != GSF_FLOATING_POINT)
    6089             :     {
    6090       27247 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    6091       27247 :         const size_t nSize =
    6092       27247 :             static_cast<size_t>((static_cast<uint64_t>(nWidth) * nHeight *
    6093       27247 :                                      nComponents * nBitsPerSample +
    6094             :                                  7) /
    6095             :                                 8);
    6096             : #ifdef HAVE_SSE2
    6097       27247 :         size_t n = nSize;
    6098             :         // Align to 16 bytes
    6099       27310 :         while ((reinterpret_cast<uintptr_t>(pabyBuffer) & 15) != 0 && n > 0)
    6100             :         {
    6101          73 :             --n;
    6102          73 :             if (*pabyBuffer)
    6103          10 :                 return false;
    6104          63 :             pabyBuffer++;
    6105             :         }
    6106             : 
    6107       27237 :         const auto zero = _mm_setzero_si128();
    6108       27237 :         constexpr int UNROLLING = 4;
    6109     2217660 :         while (n >= UNROLLING * sizeof(zero))
    6110             :         {
    6111     2202420 :             const auto v0 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6112             :                 pabyBuffer + 0 * sizeof(zero)));
    6113     2202420 :             const auto v1 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6114     2202420 :                 pabyBuffer + 1 * sizeof(zero)));
    6115     2202420 :             const auto v2 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6116     2202420 :                 pabyBuffer + 2 * sizeof(zero)));
    6117     2202420 :             const auto v3 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6118     2202420 :                 pabyBuffer + 3 * sizeof(zero)));
    6119             :             const auto v =
    6120     6607260 :                 _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    6121             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    6122             :             if (!_mm_test_all_zeros(v, v))
    6123             : #else
    6124     4404840 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    6125             : #endif
    6126             :             {
    6127       12001 :                 return false;
    6128             :             }
    6129     2190420 :             pabyBuffer += UNROLLING * sizeof(zero);
    6130     2190420 :             n -= UNROLLING * sizeof(zero);
    6131             :         }
    6132             : 
    6133      233657 :         while (n > 0)
    6134             :         {
    6135      218525 :             --n;
    6136      218525 :             if (*pabyBuffer)
    6137         104 :                 return false;
    6138      218421 :             pabyBuffer++;
    6139             :         }
    6140             : #else
    6141             : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
    6142             :         // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
    6143             :         typedef std::uint64_t WordType;
    6144             : #else
    6145             :         typedef std::uint32_t WordType;
    6146             : #endif
    6147             : 
    6148             :         const size_t nInitialIters =
    6149             :             std::min(sizeof(WordType) -
    6150             :                          static_cast<size_t>(
    6151             :                              reinterpret_cast<std::uintptr_t>(pabyBuffer) %
    6152             :                              sizeof(WordType)),
    6153             :                      nSize);
    6154             :         size_t i = 0;
    6155             :         for (; i < nInitialIters; i++)
    6156             :         {
    6157             :             if (pabyBuffer[i])
    6158             :                 return false;
    6159             :         }
    6160             :         for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
    6161             :         {
    6162             :             if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
    6163             :                 return false;
    6164             :         }
    6165             :         for (; i < nSize; i++)
    6166             :         {
    6167             :             if (pabyBuffer[i])
    6168             :                 return false;
    6169             :         }
    6170             : #endif
    6171       15132 :         return true;
    6172             :     }
    6173             : 
    6174             : #ifdef HAVE_SSE2
    6175       16635 :     else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    6176         708 :              nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    6177             :     {
    6178         708 :         const auto signMask = _mm_set1_epi32(0x7FFFFFFF);
    6179         708 :         const auto zero = _mm_setzero_si128();
    6180         708 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    6181         708 :         const size_t n = nWidth * nHeight * nComponents;
    6182             : 
    6183         708 :         size_t i = 0;
    6184         708 :         constexpr int UNROLLING = 4;
    6185         708 :         constexpr size_t VALUES_PER_ITER =
    6186             :             UNROLLING * sizeof(zero) / sizeof(float);
    6187       24983 :         for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
    6188             :         {
    6189       24934 :             const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6190             :                 pabyBuffer + 0 * sizeof(zero)));
    6191       24934 :             const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6192       24934 :                 pabyBuffer + 1 * sizeof(zero)));
    6193       24934 :             const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6194       24934 :                 pabyBuffer + 2 * sizeof(zero)));
    6195       24934 :             const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6196       24934 :                 pabyBuffer + 3 * sizeof(zero)));
    6197       74802 :             auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    6198             :             // Clear the sign bit (makes -0.0 become +0.0)
    6199       24934 :             v = _mm_and_si128(v, signMask);
    6200             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    6201             :             if (!_mm_test_all_zeros(v, v))
    6202             : #else
    6203       49868 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    6204             : #endif
    6205             :             {
    6206         659 :                 return false;
    6207             :             }
    6208       24275 :             pabyBuffer += UNROLLING * sizeof(zero);
    6209             :         }
    6210             : 
    6211         304 :         for (; i < n; i++)
    6212             :         {
    6213             :             uint32_t bits;
    6214         272 :             memcpy(&bits, pabyBuffer, sizeof(bits));
    6215         272 :             pabyBuffer += sizeof(bits);
    6216         272 :             if ((bits & 0x7FFFFFFF) != 0)
    6217          17 :                 return false;
    6218             :         }
    6219             : 
    6220          32 :         return true;
    6221             :     }
    6222             : 
    6223       15927 :     else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    6224        3905 :              nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    6225             :     {
    6226        3905 :         const auto signMask = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
    6227        3905 :         const auto zero = _mm_setzero_si128();
    6228        3905 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    6229        3905 :         const size_t n = nWidth * nHeight * nComponents;
    6230             : 
    6231        3905 :         size_t i = 0;
    6232        3905 :         constexpr int UNROLLING = 4;
    6233        3905 :         constexpr size_t VALUES_PER_ITER =
    6234             :             UNROLLING * sizeof(zero) / sizeof(double);
    6235     1664570 :         for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
    6236             :         {
    6237     1660950 :             const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6238             :                 pabyBuffer + 0 * sizeof(zero)));
    6239     1660950 :             const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6240     1660950 :                 pabyBuffer + 1 * sizeof(zero)));
    6241     1660950 :             const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6242     1660950 :                 pabyBuffer + 2 * sizeof(zero)));
    6243     1660950 :             const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6244     1660950 :                 pabyBuffer + 3 * sizeof(zero)));
    6245     4982850 :             auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    6246             :             // Clear the sign bit (makes -0.0 become +0.0)
    6247     1660950 :             v = _mm_and_si128(v, signMask);
    6248             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
    6249             :             if (!_mm_test_all_zeros(v, v))
    6250             : #else
    6251     3321900 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    6252             : #endif
    6253             :             {
    6254         289 :                 return false;
    6255             :             }
    6256     1660660 :             pabyBuffer += UNROLLING * sizeof(zero);
    6257             :         }
    6258             : 
    6259        3643 :         for (; i < n; i++)
    6260             :         {
    6261             :             uint64_t bits;
    6262          34 :             memcpy(&bits, pabyBuffer, sizeof(bits));
    6263          34 :             pabyBuffer += sizeof(bits);
    6264          34 :             if ((bits & 0x7FFFFFFFFFFFFFFFULL) != 0)
    6265           7 :                 return false;
    6266             :         }
    6267             : 
    6268        3609 :         return true;
    6269             :     }
    6270             : #endif
    6271             : 
    6272       12022 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
    6273             :     {
    6274       22406 :         return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
    6275       11203 :                HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
    6276       11203 :                               static_cast<uint8_t>(dfNoDataValue), nWidth,
    6277       11203 :                               nHeight, nLineStride, nComponents);
    6278             :     }
    6279         819 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
    6280             :     {
    6281             :         // Use unsigned implementation by converting the nodatavalue to
    6282             :         // unsigned
    6283         119 :         return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
    6284          59 :                HasOnlyNoDataT(
    6285             :                    static_cast<const uint8_t *>(pBuffer),
    6286          59 :                    static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
    6287          60 :                    nWidth, nHeight, nLineStride, nComponents);
    6288             :     }
    6289         759 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
    6290             :     {
    6291          23 :         return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
    6292          11 :                HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
    6293          11 :                               static_cast<uint16_t>(dfNoDataValue), nWidth,
    6294          12 :                               nHeight, nLineStride, nComponents);
    6295             :     }
    6296         747 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
    6297             :     {
    6298             :         // Use unsigned implementation by converting the nodatavalue to
    6299             :         // unsigned
    6300         111 :         return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
    6301          55 :                HasOnlyNoDataT(
    6302             :                    static_cast<const uint16_t *>(pBuffer),
    6303          55 :                    static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
    6304          56 :                    nWidth, nHeight, nLineStride, nComponents);
    6305             :     }
    6306         691 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
    6307             :     {
    6308         129 :         return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
    6309          64 :                HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
    6310             :                               static_cast<uint32_t>(dfNoDataValue), nWidth,
    6311          65 :                               nHeight, nLineStride, nComponents);
    6312             :     }
    6313         626 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
    6314             :     {
    6315             :         // Use unsigned implementation by converting the nodatavalue to
    6316             :         // unsigned
    6317          23 :         return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
    6318          11 :                HasOnlyNoDataT(
    6319             :                    static_cast<const uint32_t *>(pBuffer),
    6320          11 :                    static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
    6321          12 :                    nWidth, nHeight, nLineStride, nComponents);
    6322             :     }
    6323         614 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
    6324             :     {
    6325         112 :         return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
    6326          56 :                HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
    6327             :                               static_cast<uint64_t>(dfNoDataValue), nWidth,
    6328          56 :                               nHeight, nLineStride, nComponents);
    6329             :     }
    6330         558 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
    6331             :     {
    6332             :         // Use unsigned implementation by converting the nodatavalue to
    6333             :         // unsigned
    6334           0 :         return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
    6335           0 :                HasOnlyNoDataT(
    6336             :                    static_cast<const uint64_t *>(pBuffer),
    6337           0 :                    static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
    6338           0 :                    nWidth, nHeight, nLineStride, nComponents);
    6339             :     }
    6340         558 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
    6341             :     {
    6342         106 :         return (std::isnan(dfNoDataValue) ||
    6343         211 :                 GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
    6344         105 :                HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
    6345             :                               static_cast<GFloat16>(dfNoDataValue), nWidth,
    6346         106 :                               nHeight, nLineStride, nComponents);
    6347             :     }
    6348         452 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    6349             :     {
    6350         268 :         return (std::isnan(dfNoDataValue) ||
    6351         535 :                 GDALIsValueInRange<float>(dfNoDataValue)) &&
    6352         267 :                HasOnlyNoDataT(static_cast<const float *>(pBuffer),
    6353             :                               static_cast<float>(dfNoDataValue), nWidth,
    6354         268 :                               nHeight, nLineStride, nComponents);
    6355             :     }
    6356         184 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    6357             :     {
    6358         184 :         return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
    6359             :                               dfNoDataValue, nWidth, nHeight, nLineStride,
    6360         184 :                               nComponents);
    6361             :     }
    6362           0 :     return false;
    6363             : }
    6364             : 
    6365             : #ifdef HAVE_SSE2
    6366             : 
    6367             : /************************************************************************/
    6368             : /*                       GDALDeinterleave3Byte()                        */
    6369             : /************************************************************************/
    6370             : 
    6371             : #if defined(__GNUC__) && !defined(__clang__)
    6372             : __attribute__((optimize("no-tree-vectorize")))
    6373             : #endif
    6374      380714 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    6375             :                                   GByte *CPL_RESTRICT pabyDest0,
    6376             :                                   GByte *CPL_RESTRICT pabyDest1,
    6377             :                                   GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    6378             : #ifdef USE_NEON_OPTIMIZATIONS
    6379             : {
    6380             :     return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6381             :                                        nIters);
    6382             : }
    6383             : #else
    6384             : {
    6385             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    6386      380714 :     if (CPLHaveRuntimeSSSE3())
    6387             :     {
    6388      380712 :         return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    6389      380712 :                                            pabyDest2, nIters);
    6390             :     }
    6391             : #endif
    6392             : 
    6393           2 :     size_t i = 0;
    6394           2 :     if (((reinterpret_cast<uintptr_t>(pabySrc) |
    6395           2 :           reinterpret_cast<uintptr_t>(pabyDest0) |
    6396           2 :           reinterpret_cast<uintptr_t>(pabyDest1) |
    6397           2 :           reinterpret_cast<uintptr_t>(pabyDest2)) %
    6398             :          sizeof(unsigned int)) == 0)
    6399             :     {
    6400             :         // Slightly better than GCC autovectorizer
    6401          17 :         for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
    6402             :         {
    6403          15 :             unsigned int word0 =
    6404          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
    6405          15 :             unsigned int word1 =
    6406          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
    6407          15 :             unsigned int word2 =
    6408          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
    6409          15 :             reinterpret_cast<unsigned int *>(pabyDest0)[j] =
    6410          15 :                 (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
    6411          15 :                 ((word2 >> 8) << 24);
    6412          15 :             reinterpret_cast<unsigned int *>(pabyDest1)[j] =
    6413          15 :                 ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
    6414          15 :                 (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
    6415          15 :             pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
    6416          15 :             pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
    6417          15 :             pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
    6418          15 :             pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
    6419             :         }
    6420             :     }
    6421             : #if defined(__clang__)
    6422             : #pragma clang loop vectorize(disable)
    6423             : #endif
    6424           3 :     for (; i < nIters; ++i)
    6425             :     {
    6426           1 :         pabyDest0[i] = pabySrc[3 * i + 0];
    6427           1 :         pabyDest1[i] = pabySrc[3 * i + 1];
    6428           1 :         pabyDest2[i] = pabySrc[3 * i + 2];
    6429             :     }
    6430             : }
    6431             : #endif
    6432             : 
    6433             : /************************************************************************/
    6434             : /*                       GDALDeinterleave4Byte()                        */
    6435             : /************************************************************************/
    6436             : 
    6437             : #if !defined(__GNUC__) || defined(__clang__)
    6438             : 
    6439             : /************************************************************************/
    6440             : /*                            deinterleave()                            */
    6441             : /************************************************************************/
    6442             : 
    6443             : template <bool SHIFT, bool MASK>
    6444             : inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
    6445             :                             __m128i &xmm2_ori, __m128i &xmm3_ori)
    6446             : {
    6447             :     // Set higher 24bit of each int32 packed word to 0
    6448             :     if (SHIFT)
    6449             :     {
    6450             :         xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
    6451             :         xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
    6452             :         xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
    6453             :         xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
    6454             :     }
    6455             :     __m128i xmm0;
    6456             :     __m128i xmm1;
    6457             :     __m128i xmm2;
    6458             :     __m128i xmm3;
    6459             :     if (MASK)
    6460             :     {
    6461             :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    6462             :         xmm0 = _mm_and_si128(xmm0_ori, xmm_mask);
    6463             :         xmm1 = _mm_and_si128(xmm1_ori, xmm_mask);
    6464             :         xmm2 = _mm_and_si128(xmm2_ori, xmm_mask);
    6465             :         xmm3 = _mm_and_si128(xmm3_ori, xmm_mask);
    6466             :     }
    6467             :     else
    6468             :     {
    6469             :         xmm0 = xmm0_ori;
    6470             :         xmm1 = xmm1_ori;
    6471             :         xmm2 = xmm2_ori;
    6472             :         xmm3 = xmm3_ori;
    6473             :     }
    6474             :     // Pack int32 to int16
    6475             :     xmm0 = _mm_packs_epi32(xmm0, xmm1);
    6476             :     xmm2 = _mm_packs_epi32(xmm2, xmm3);
    6477             :     // Pack int16 to uint8
    6478             :     xmm0 = _mm_packus_epi16(xmm0, xmm2);
    6479             :     return xmm0;
    6480             : }
    6481             : 
    6482             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    6483             :                                   GByte *CPL_RESTRICT pabyDest0,
    6484             :                                   GByte *CPL_RESTRICT pabyDest1,
    6485             :                                   GByte *CPL_RESTRICT pabyDest2,
    6486             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6487             : #ifdef USE_NEON_OPTIMIZATIONS
    6488             : {
    6489             :     return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6490             :                                        pabyDest3, nIters);
    6491             : }
    6492             : #else
    6493             : {
    6494             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    6495             :     if (CPLHaveRuntimeSSSE3())
    6496             :     {
    6497             :         return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    6498             :                                            pabyDest2, pabyDest3, nIters);
    6499             :     }
    6500             : #endif
    6501             : 
    6502             :     // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
    6503             :     // do something slightly better.
    6504             :     size_t i = 0;
    6505             :     for (; i + 15 < nIters; i += 16)
    6506             :     {
    6507             :         __m128i xmm0_ori = _mm_loadu_si128(
    6508             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
    6509             :         __m128i xmm1_ori = _mm_loadu_si128(
    6510             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
    6511             :         __m128i xmm2_ori = _mm_loadu_si128(
    6512             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
    6513             :         __m128i xmm3_ori = _mm_loadu_si128(
    6514             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
    6515             : 
    6516             :         _mm_storeu_si128(
    6517             :             reinterpret_cast<__m128i *>(pabyDest0 + i),
    6518             :             deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6519             :         _mm_storeu_si128(
    6520             :             reinterpret_cast<__m128i *>(pabyDest1 + i),
    6521             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6522             :         _mm_storeu_si128(
    6523             :             reinterpret_cast<__m128i *>(pabyDest2 + i),
    6524             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6525             :         _mm_storeu_si128(
    6526             :             reinterpret_cast<__m128i *>(pabyDest3 + i),
    6527             :             deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6528             :     }
    6529             : 
    6530             : #if defined(__clang__)
    6531             : #pragma clang loop vectorize(disable)
    6532             : #endif
    6533             :     for (; i < nIters; ++i)
    6534             :     {
    6535             :         pabyDest0[i] = pabySrc[4 * i + 0];
    6536             :         pabyDest1[i] = pabySrc[4 * i + 1];
    6537             :         pabyDest2[i] = pabySrc[4 * i + 2];
    6538             :         pabyDest3[i] = pabySrc[4 * i + 3];
    6539             :     }
    6540             : }
    6541             : #endif
    6542             : #else
    6543             : // GCC autovectorizer does an excellent job
    6544       73229 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
    6545             :     const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    6546             :     GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
    6547             :     GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6548             : {
    6549   540369000 :     for (size_t i = 0; i < nIters; ++i)
    6550             :     {
    6551   540295000 :         pabyDest0[i] = pabySrc[4 * i + 0];
    6552   540295000 :         pabyDest1[i] = pabySrc[4 * i + 1];
    6553   540295000 :         pabyDest2[i] = pabySrc[4 * i + 2];
    6554   540295000 :         pabyDest3[i] = pabySrc[4 * i + 3];
    6555             :     }
    6556       73229 : }
    6557             : #endif
    6558             : 
    6559             : #else
    6560             : 
    6561             : /************************************************************************/
    6562             : /*                       GDALDeinterleave3Byte()                        */
    6563             : /************************************************************************/
    6564             : 
    6565             : // TODO: Enabling below could help on non-Intel architectures where GCC knows
    6566             : // how to auto-vectorize
    6567             : // #if defined(__GNUC__)
    6568             : //__attribute__((optimize("tree-vectorize")))
    6569             : // #endif
    6570             : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    6571             :                                   GByte *CPL_RESTRICT pabyDest0,
    6572             :                                   GByte *CPL_RESTRICT pabyDest1,
    6573             :                                   GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    6574             : {
    6575             :     for (size_t i = 0; i < nIters; ++i)
    6576             :     {
    6577             :         pabyDest0[i] = pabySrc[3 * i + 0];
    6578             :         pabyDest1[i] = pabySrc[3 * i + 1];
    6579             :         pabyDest2[i] = pabySrc[3 * i + 2];
    6580             :     }
    6581             : }
    6582             : 
    6583             : /************************************************************************/
    6584             : /*                       GDALDeinterleave4Byte()                        */
    6585             : /************************************************************************/
    6586             : 
    6587             : // TODO: Enabling below could help on non-Intel architectures where gcc knows
    6588             : // how to auto-vectorize
    6589             : // #if defined(__GNUC__)
    6590             : //__attribute__((optimize("tree-vectorize")))
    6591             : // #endif
    6592             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    6593             :                                   GByte *CPL_RESTRICT pabyDest0,
    6594             :                                   GByte *CPL_RESTRICT pabyDest1,
    6595             :                                   GByte *CPL_RESTRICT pabyDest2,
    6596             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6597             : {
    6598             :     for (size_t i = 0; i < nIters; ++i)
    6599             :     {
    6600             :         pabyDest0[i] = pabySrc[4 * i + 0];
    6601             :         pabyDest1[i] = pabySrc[4 * i + 1];
    6602             :         pabyDest2[i] = pabySrc[4 * i + 2];
    6603             :         pabyDest3[i] = pabySrc[4 * i + 3];
    6604             :     }
    6605             : }
    6606             : 
    6607             : #endif
    6608             : 
    6609             : /************************************************************************/
    6610             : /*                          GDALDeinterleave()                          */
    6611             : /************************************************************************/
    6612             : 
    6613             : /*! Copy values from a pixel-interleave buffer to multiple per-component
    6614             :     buffers.
    6615             : 
    6616             :     In pseudo-code
    6617             :     \verbatim
    6618             :     for(size_t i = 0; i < nIters; ++i)
    6619             :         for(int iComp = 0; iComp < nComponents; iComp++ )
    6620             :             ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
    6621             :     \endverbatim
    6622             : 
    6623             :     The implementation is optimized for a few cases, like de-interleaving
    6624             :     of 3 or 4-components Byte buffers.
    6625             : 
    6626             :     \since GDAL 3.6
    6627             :  */
    6628      454293 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
    6629             :                       int nComponents, void **ppDestBuffer,
    6630             :                       GDALDataType eDestDT, size_t nIters)
    6631             : {
    6632      454293 :     if (eSourceDT == eDestDT)
    6633             :     {
    6634      454271 :         if (eSourceDT == GDT_UInt8 || eSourceDT == GDT_Int8)
    6635             :         {
    6636      453950 :             if (nComponents == 3)
    6637             :             {
    6638      380714 :                 const GByte *CPL_RESTRICT pabySrc =
    6639             :                     static_cast<const GByte *>(pSourceBuffer);
    6640      380714 :                 GByte *CPL_RESTRICT pabyDest0 =
    6641             :                     static_cast<GByte *>(ppDestBuffer[0]);
    6642      380714 :                 GByte *CPL_RESTRICT pabyDest1 =
    6643             :                     static_cast<GByte *>(ppDestBuffer[1]);
    6644      380714 :                 GByte *CPL_RESTRICT pabyDest2 =
    6645             :                     static_cast<GByte *>(ppDestBuffer[2]);
    6646      380714 :                 GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6647             :                                       nIters);
    6648      380714 :                 return;
    6649             :             }
    6650       73236 :             else if (nComponents == 4)
    6651             :             {
    6652       73229 :                 const GByte *CPL_RESTRICT pabySrc =
    6653             :                     static_cast<const GByte *>(pSourceBuffer);
    6654       73229 :                 GByte *CPL_RESTRICT pabyDest0 =
    6655             :                     static_cast<GByte *>(ppDestBuffer[0]);
    6656       73229 :                 GByte *CPL_RESTRICT pabyDest1 =
    6657             :                     static_cast<GByte *>(ppDestBuffer[1]);
    6658       73229 :                 GByte *CPL_RESTRICT pabyDest2 =
    6659             :                     static_cast<GByte *>(ppDestBuffer[2]);
    6660       73229 :                 GByte *CPL_RESTRICT pabyDest3 =
    6661             :                     static_cast<GByte *>(ppDestBuffer[3]);
    6662       73229 :                 GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6663             :                                       pabyDest3, nIters);
    6664       73229 :                 return;
    6665           7 :             }
    6666             :         }
    6667             : #if ((defined(__GNUC__) && !defined(__clang__)) ||                             \
    6668             :      defined(__INTEL_CLANG_COMPILER)) &&                                       \
    6669             :     defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
    6670         642 :         else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
    6671         321 :                  CPLHaveRuntimeSSSE3())
    6672             :         {
    6673         321 :             if (nComponents == 3)
    6674             :             {
    6675         126 :                 const GUInt16 *CPL_RESTRICT panSrc =
    6676             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    6677         126 :                 GUInt16 *CPL_RESTRICT panDest0 =
    6678             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    6679         126 :                 GUInt16 *CPL_RESTRICT panDest1 =
    6680             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    6681         126 :                 GUInt16 *CPL_RESTRICT panDest2 =
    6682             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    6683         126 :                 GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
    6684             :                                               panDest2, nIters);
    6685         126 :                 return;
    6686             :             }
    6687             : #if !defined(__INTEL_CLANG_COMPILER)
    6688             :             // ICC autovectorizer doesn't do a good job, at least with icx
    6689             :             // 2022.1.0.20220316
    6690         195 :             else if (nComponents == 4)
    6691             :             {
    6692         195 :                 const GUInt16 *CPL_RESTRICT panSrc =
    6693             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    6694         195 :                 GUInt16 *CPL_RESTRICT panDest0 =
    6695             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    6696         195 :                 GUInt16 *CPL_RESTRICT panDest1 =
    6697             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    6698         195 :                 GUInt16 *CPL_RESTRICT panDest2 =
    6699             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    6700         195 :                 GUInt16 *CPL_RESTRICT panDest3 =
    6701             :                     static_cast<GUInt16 *>(ppDestBuffer[3]);
    6702         195 :                 GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
    6703             :                                               panDest2, panDest3, nIters);
    6704         195 :                 return;
    6705             :             }
    6706             : #endif
    6707             :         }
    6708             : #endif
    6709             :     }
    6710             : 
    6711          29 :     const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
    6712          29 :     const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
    6713         108 :     for (int iComp = 0; iComp < nComponents; iComp++)
    6714             :     {
    6715          79 :         GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
    6716          79 :                             iComp * nSourceDTSize,
    6717             :                         eSourceDT, nComponents * nSourceDTSize,
    6718          79 :                         ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
    6719             :     }
    6720             : }
    6721             : 
    6722             : /************************************************************************/
    6723             : /*                   GDALTranspose2DSingleToSingle()                    */
    6724             : /************************************************************************/
    6725             : /**
    6726             :  * Transpose a 2D array of non-complex values, in a efficient (cache-oblivious) way.
    6727             :  *
    6728             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6729             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6730             :  * @param nSrcWidth Width of pSrc array.
    6731             :  * @param nSrcHeight Height of pSrc array.
    6732             :  */
    6733             : 
    6734             : template <class DST, class SRC>
    6735         160 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
    6736             :                                    DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6737             :                                    size_t nSrcHeight)
    6738             : {
    6739         160 :     constexpr size_t blocksize = 32;
    6740         345 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6741             :     {
    6742         185 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    6743        5016 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6744             :         {
    6745             :             // transpose the block beginning at [i,j]
    6746        4831 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    6747       26185 :             for (size_t k = i; k < max_k; ++k)
    6748             :             {
    6749      669282 :                 for (size_t l = j; l < max_l; ++l)
    6750             :                 {
    6751      647928 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],
    6752      647928 :                                  pDst[k + l * nSrcHeight]);
    6753             :                 }
    6754             :             }
    6755             :         }
    6756             :     }
    6757         160 : }
    6758             : 
    6759             : /************************************************************************/
    6760             : /*                  GDALTranspose2DComplexToComplex()                   */
    6761             : /************************************************************************/
    6762             : /**
    6763             :  * Transpose a 2D array of complex values into an array of complex values,
    6764             :  * in a efficient (cache-oblivious) way.
    6765             :  *
    6766             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6767             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6768             :  * @param nSrcWidth Width of pSrc array.
    6769             :  * @param nSrcHeight Height of pSrc array.
    6770             :  */
    6771             : template <class DST, class SRC>
    6772          25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
    6773             :                                      DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6774             :                                      size_t nSrcHeight)
    6775             : {
    6776          25 :     constexpr size_t blocksize = 32;
    6777          50 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6778             :     {
    6779          25 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    6780          50 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6781             :         {
    6782             :             // transpose the block beginning at [i,j]
    6783          25 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    6784          75 :             for (size_t k = i; k < max_k; ++k)
    6785             :             {
    6786         200 :                 for (size_t l = j; l < max_l; ++l)
    6787             :                 {
    6788         150 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
    6789         150 :                                  pDst[2 * (k + l * nSrcHeight) + 0]);
    6790         150 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
    6791         150 :                                  pDst[2 * (k + l * nSrcHeight) + 1]);
    6792             :                 }
    6793             :             }
    6794             :         }
    6795             :     }
    6796          25 : }
    6797             : 
    6798             : /************************************************************************/
    6799             : /*                   GDALTranspose2DComplexToSingle()                   */
    6800             : /************************************************************************/
    6801             : /**
    6802             :  * Transpose a 2D array of complex values into an array of non-complex values,
    6803             :  * in a efficient (cache-oblivious) way.
    6804             :  *
    6805             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6806             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6807             :  * @param nSrcWidth Width of pSrc array.
    6808             :  * @param nSrcHeight Height of pSrc array.
    6809             :  */
    6810             : template <class DST, class SRC>
    6811          55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
    6812             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6813             :                                     size_t nSrcHeight)
    6814             : {
    6815          55 :     constexpr size_t blocksize = 32;
    6816         110 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6817             :     {
    6818          55 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    6819         110 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6820             :         {
    6821             :             // transpose the block beginning at [i,j]
    6822          55 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    6823         165 :             for (size_t k = i; k < max_k; ++k)
    6824             :             {
    6825         440 :                 for (size_t l = j; l < max_l; ++l)
    6826             :                 {
    6827         330 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
    6828         330 :                                  pDst[k + l * nSrcHeight]);
    6829             :                 }
    6830             :             }
    6831             :         }
    6832             :     }
    6833          55 : }
    6834             : 
    6835             : /************************************************************************/
    6836             : /*                   GDALTranspose2DSingleToComplex()                   */
    6837             : /************************************************************************/
    6838             : /**
    6839             :  * Transpose a 2D array of non-complex values into an array of complex values,
    6840             :  * in a efficient (cache-oblivious) way.
    6841             :  *
    6842             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6843             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6844             :  * @param nSrcWidth Width of pSrc array.
    6845             :  * @param nSrcHeight Height of pSrc array.
    6846             :  */
    6847             : template <class DST, class SRC>
    6848          55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
    6849             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6850             :                                     size_t nSrcHeight)
    6851             : {
    6852          55 :     constexpr size_t blocksize = 32;
    6853         110 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6854             :     {
    6855          55 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    6856         110 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6857             :         {
    6858             :             // transpose the block beginning at [i,j]
    6859          55 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    6860         165 :             for (size_t k = i; k < max_k; ++k)
    6861             :             {
    6862         440 :                 for (size_t l = j; l < max_l; ++l)
    6863             :                 {
    6864         330 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],
    6865         330 :                                  pDst[2 * (k + l * nSrcHeight) + 0]);
    6866         330 :                     pDst[2 * (k + l * nSrcHeight) + 1] = 0;
    6867             :                 }
    6868             :             }
    6869             :         }
    6870             :     }
    6871          55 : }
    6872             : 
    6873             : /************************************************************************/
    6874             : /*                          GDALTranspose2D()                           */
    6875             : /************************************************************************/
    6876             : 
    6877             : template <class DST, bool DST_IS_COMPLEX>
    6878         295 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
    6879             :                             size_t nSrcWidth, size_t nSrcHeight)
    6880             : {
    6881             : #define CALL_GDALTranspose2D_internal(SRC_TYPE)                                \
    6882             :     do                                                                         \
    6883             :     {                                                                          \
    6884             :         if constexpr (DST_IS_COMPLEX)                                          \
    6885             :         {                                                                      \
    6886             :             GDALTranspose2DSingleToComplex(                                    \
    6887             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6888             :                 nSrcHeight);                                                   \
    6889             :         }                                                                      \
    6890             :         else                                                                   \
    6891             :         {                                                                      \
    6892             :             GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
    6893             :                                           pDst, nSrcWidth, nSrcHeight);        \
    6894             :         }                                                                      \
    6895             :     } while (0)
    6896             : 
    6897             : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE)                         \
    6898             :     do                                                                         \
    6899             :     {                                                                          \
    6900             :         if constexpr (DST_IS_COMPLEX)                                          \
    6901             :         {                                                                      \
    6902             :             GDALTranspose2DComplexToComplex(                                   \
    6903             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6904             :                 nSrcHeight);                                                   \
    6905             :         }                                                                      \
    6906             :         else                                                                   \
    6907             :         {                                                                      \
    6908             :             GDALTranspose2DComplexToSingle(                                    \
    6909             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6910             :                 nSrcHeight);                                                   \
    6911             :         }                                                                      \
    6912             :     } while (0)
    6913             : 
    6914             :     // clang-format off
    6915         295 :     switch (eSrcType)
    6916             :     {
    6917          16 :         case GDT_UInt8:     CALL_GDALTranspose2D_internal(uint8_t); break;
    6918          15 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t); break;
    6919          33 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t); break;
    6920          20 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t); break;
    6921          24 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t); break;
    6922          16 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t); break;
    6923          16 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t); break;
    6924          16 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t); break;
    6925          16 :         case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16); break;
    6926          19 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float); break;
    6927          24 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double); break;
    6928          16 :         case GDT_CInt16:   CALL_GDALTranspose2DComplex_internal(int16_t); break;
    6929          16 :         case GDT_CInt32:   CALL_GDALTranspose2DComplex_internal(int32_t); break;
    6930          16 :         case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
    6931          16 :         case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
    6932          16 :         case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
    6933           0 :         case GDT_Unknown:
    6934             :         case GDT_TypeCount:
    6935           0 :             break;
    6936             :     }
    6937             :         // clang-format on
    6938             : 
    6939             : #undef CALL_GDALTranspose2D_internal
    6940             : #undef CALL_GDALTranspose2DComplex_internal
    6941         295 : }
    6942             : 
    6943             : /************************************************************************/
    6944             : /*                        GDALInterleave2Byte()                         */
    6945             : /************************************************************************/
    6946             : 
    6947             : #if defined(HAVE_SSE2) &&                                                      \
    6948             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    6949             : 
    6950             : // Explicit SSE2 variant: interleaves two consecutive runs of nIters bytes of pSrc into pDst.
    6951             : // The ICC autovectorizer doesn't do a good job at generating good SSE code, at least with icx 2024.0.2.20231213, but it nicely unrolls the loop below.
    6952             : #if defined(__GNUC__)
    6953             : __attribute__((noinline))
    6954             : #endif
    6955             : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,  // pSrc: 2 * nIters bytes are read (run A then run B)
    6956             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)  // pDst: 2 * nIters bytes are written as A0 B0 A1 B1 ...
    6957             : {
    6958             :     size_t i = 0;  // shared by the vector loop and the scalar tail loop
    6959             :     constexpr size_t VALS_PER_ITER = 16;  // number of bytes in one __m128i register
    6960             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
    6961             :     {
    6962             :         __m128i xmm0 =
    6963             :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));  // 16 bytes of the first run
    6964             :         __m128i xmm1 = _mm_loadu_si128(
    6965             :             reinterpret_cast<__m128i const *>(pSrc + i + nIters));  // matching 16 bytes of the second run
    6966             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
    6967             :                          _mm_unpacklo_epi8(xmm0, xmm1));  // interleaves bytes 0..7 of both registers
    6968             :         _mm_storeu_si128(
    6969             :             reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
    6970             :             _mm_unpackhi_epi8(xmm0, xmm1));  // interleaves bytes 8..15 of both registers
    6971             :     }
    6972             : #if defined(__clang__)
    6973             : #pragma clang loop vectorize(disable)
    6974             : #endif
    6975             :     for (; i < nIters; ++i)  // scalar tail for the values left over by the 16-wide loop
    6976             :     {
    6977             :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];
    6978             :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];
    6979             :     }
    6980             : }
    6981             : 
    6982             : #else  // generic variant: plain loop that the compiler is asked to auto-vectorize
    6983             : 
    6984             : #if defined(__GNUC__) && !defined(__clang__)
    6985             : __attribute__((optimize("tree-vectorize")))
    6986             : #endif
    6987             : #if defined(__GNUC__)
    6988             : __attribute__((noinline))
    6989             : #endif
    6990             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6991             : // clang++ -O2 -fsanitize=undefined fails to vectorize the loop below, so silence the resulting -Wpass-failed warning
    6992             : #pragma clang diagnostic push
    6993             : #pragma clang diagnostic ignored "-Wpass-failed"
    6994             : #endif
    6995           9 : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,  // pSrc: 2 * nIters bytes are read (run A then run B)
    6996             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)  // pDst: 2 * nIters bytes are written as A0 B0 A1 B1 ...
    6997             : {
    6998             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6999             : #pragma clang loop vectorize(enable)
    7000             : #endif
    7001      355429 :     for (size_t i = 0; i < nIters; ++i)
    7002             :     {
    7003      355420 :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];  // value i of the first run
    7004      355420 :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];  // value i of the second run
    7005             :     }
    7006           9 : }
    7007             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7008             : #pragma clang diagnostic pop
    7009             : #endif
    7010             : 
    7011             : #endif  // SSE2 versus generic GDALInterleave2Byte()
    7012             : 
    7013             : /************************************************************************/
    7014             : /*                        GDALInterleave4Byte()                         */
    7015             : /************************************************************************/
    7016             : 
    7017             : #if defined(HAVE_SSE2) &&                                                      \
    7018             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    7019             : 
    7020             : // Explicit SSE2 variant: interleaves four consecutive runs of nIters bytes of pSrc into pDst.
    7021             : // The ICC autovectorizer doesn't do a good job at generating good SSE code, at least with icx 2024.0.2.20231213, but it nicely unrolls the loop below.
    7022             : #if defined(__GNUC__)
    7023             : __attribute__((noinline))
    7024             : #endif
    7025             : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,  // pSrc: 4 * nIters bytes are read (runs A, B, C, D)
    7026             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)  // pDst: 4 * nIters bytes are written as A0 B0 C0 D0 A1 B1 ...
    7027             : {
    7028             :     size_t i = 0;  // shared by the vector loop and the scalar tail loop
    7029             :     constexpr size_t VALS_PER_ITER = 16;  // number of bytes in one __m128i register
    7030             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
    7031             :     {
    7032             :         __m128i xmm0 = _mm_loadu_si128(
    7033             :             reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
    7034             :         __m128i xmm1 = _mm_loadu_si128(
    7035             :             reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
    7036             :         __m128i xmm2 = _mm_loadu_si128(
    7037             :             reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
    7038             :         __m128i xmm3 = _mm_loadu_si128(
    7039             :             reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
    7040             :         auto tmp0 = _mm_unpacklo_epi8(
    7041             :             xmm0,
    7042             :             xmm1);  // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
    7043             :         auto tmp1 = _mm_unpackhi_epi8(
    7044             :             xmm0,
    7045             :             xmm1);  // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
    7046             :         auto tmp2 = _mm_unpacklo_epi8(
    7047             :             xmm2,
    7048             :             xmm3);  // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
    7049             :         auto tmp3 = _mm_unpackhi_epi8(
    7050             :             xmm2,
    7051             :             xmm3);  // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
    7052             :         auto tmp2_0 = _mm_unpacklo_epi16(
    7053             :             tmp0,
    7054             :             tmp2);  // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
    7055             :         auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);  // same 4-byte grouping for source indices 4..7
    7056             :         auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);  // same 4-byte grouping for source indices 8..11
    7057             :         auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);  // same 4-byte grouping for source indices 12..15
    7058             :         _mm_storeu_si128(
    7059             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
    7060             :             tmp2_0);
    7061             :         _mm_storeu_si128(
    7062             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
    7063             :             tmp2_1);
    7064             :         _mm_storeu_si128(
    7065             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
    7066             :             tmp2_2);
    7067             :         _mm_storeu_si128(
    7068             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
    7069             :             tmp2_3);
    7070             :     }
    7071             : #if defined(__clang__)
    7072             : #pragma clang loop vectorize(disable)
    7073             : #endif
    7074             :     for (; i < nIters; ++i)  // scalar tail for the values left over by the 16-wide loop
    7075             :     {
    7076             :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];
    7077             :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];
    7078             :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];
    7079             :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];
    7080             :     }
    7081             : }
    7082             : 
    7083             : #else  // generic variant: plain loop that the compiler is asked to auto-vectorize
    7084             : 
    7085             : #if defined(__GNUC__) && !defined(__clang__)
    7086             : __attribute__((optimize("tree-vectorize")))
    7087             : #endif
    7088             : #if defined(__GNUC__)
    7089             : __attribute__((noinline))
    7090             : #endif
    7091             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7092             : // clang++ -O2 -fsanitize=undefined fails to vectorize the loop below, so silence the resulting -Wpass-failed warning
    7093             : #pragma clang diagnostic push
    7094             : #pragma clang diagnostic ignored "-Wpass-failed"
    7095             : #endif
    7096          30 : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,  // pSrc: 4 * nIters bytes are read (runs A, B, C, D)
    7097             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)  // pDst: 4 * nIters bytes are written as A0 B0 C0 D0 A1 B1 ...
    7098             : {
    7099             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7100             : #pragma clang loop vectorize(enable)
    7101             : #endif
    7102    49620700 :     for (size_t i = 0; i < nIters; ++i)
    7103             :     {
    7104    49620600 :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];  // value i of the first run
    7105    49620600 :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];  // value i of the second run
    7106    49620600 :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];  // value i of the third run
    7107    49620600 :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];  // value i of the fourth run
    7108             :     }
    7109          30 : }
    7110             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7111             : #pragma clang diagnostic pop
    7112             : #endif
    7113             : 
    7114             : #endif  // SSE2 versus generic GDALInterleave4Byte()
    7115             : 
    7116             : /************************************************************************/
    7117             : /*                          GDALTranspose2D()                           */
    7118             : /************************************************************************/
    7119             : 
    7120             : /**
    7121             :  * Transpose a 2D array in an efficient (cache-oblivious) way.
    7122             :  *
    7123             :  * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
    7124             :  * @param eSrcType Data type of pSrc.
    7125             :  * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
    7126             :  * @param eDstType Data type of pDst. Nothing is written when it is GDT_Unknown or GDT_TypeCount.
    7127             :  * @param nSrcWidth Width of pSrc array.
    7128             :  * @param nSrcHeight Height of pSrc array.
    7129             :  * @since GDAL 3.11
    7130             :  */
    7131             : 
    7132         365 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
    7133             :                      GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
    7134             : {
    7135         365 :     if (eSrcType == eDstType && (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8))  // same 1-byte type on both sides: try specialized paths first
    7136             :     {
    7137          70 :         if (nSrcHeight == 2)  // two rows: the transposition is an interleaving of both rows
    7138             :         {
    7139           9 :             GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
    7140             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    7141           9 :             return;
    7142             :         }
    7143          61 :         if (nSrcHeight == 4)  // four rows: the transposition is an interleaving of the four rows
    7144             :         {
    7145          30 :             GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
    7146             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    7147          30 :             return;
    7148             :         }
    7149             : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                    \
    7150             :      (defined(__x86_64) || defined(_M_X64)))
    7151          31 :         if (CPLHaveRuntimeSSSE3())  // runtime check; when it fails, execution falls through to the generic switch below
    7152             :         {
    7153          31 :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    7154             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    7155             :                                        nSrcHeight);
    7156          31 :             return;
    7157             :         }
    7158             : #elif defined(USE_NEON_OPTIMIZATIONS)  // NOTE(review): no runtime check here; presumably the _SSSE3 function is built through an SSE-to-NEON layer - confirm
    7159             :         {
    7160             :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    7161             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    7162             :                                        nSrcHeight);
    7163             :             return;
    7164             :         }
    7165             : #endif
    7166             :     }
    7167             : 
    7168             : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX)                \
    7169             :     GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>(                                 \
    7170             :         pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
    7171             : 
    7172             :     // clang-format off
    7173         295 :     switch (eDstType)  // generic path: the destination type selects the template instantiation, the source type is passed at runtime
    7174             :     {
    7175          15 :         case GDT_UInt8:     CALL_GDALTranspose2D_internal(uint8_t, false); break;
    7176          15 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t, false); break;
    7177          33 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t, false); break;
    7178          20 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t, false); break;
    7179          24 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t, false); break;
    7180          16 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t, false); break;
    7181          16 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t, false); break;
    7182          16 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t, false); break;
    7183          16 :         case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16, false); break;
    7184          19 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float, false); break;
    7185          25 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double, false); break;
    7186          16 :         case GDT_CInt16:   CALL_GDALTranspose2D_internal(int16_t, true); break;  // complex types: component type plus DST_IS_COMPLEX = true
    7187          16 :         case GDT_CInt32:   CALL_GDALTranspose2D_internal(int32_t, true); break;
    7188          16 :         case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
    7189          16 :         case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
    7190          16 :         case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
    7191           0 :         case GDT_Unknown:
    7192             :         case GDT_TypeCount:
    7193           0 :             break;  // not actual data types: pDst is left untouched
    7194             :     }
    7195             :         // clang-format on
    7196             : 
    7197             : #undef CALL_GDALTranspose2D_internal
    7198             : }
    7199             : 
    7200             : /************************************************************************/
    7201             : /*                     ExtractBitAndConvertTo255()                      */
    7202             : /************************************************************************/
    7203             : 
    7204             : #if defined(__GNUC__) || defined(_MSC_VER)
    7205             : // Signedness of char is implementation-dependent, so be explicit.
    7206             : // Assumes two's-complement integer types and sign extension when right-shifting a negative value.
    7207             : // GCC guarantees both:
    7208             : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
    7209      124890 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)  // returns 255 when bit nBit of byVal (0 = least significant) is set, else 0
    7210             : {
    7211      124890 :     return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>  // the left shift moves bit nBit into the sign bit of the signed char
    7212      124890 :                               7);  // the sign-extending right shift by 7 then copies that bit into all 8 bits: 0x00 or 0xFF
    7213             : }
    7214             : #else
    7215             : // Portable way: test the bit and select 255 or 0 explicitly.
    7216             : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)  // returns 255 when bit nBit of byVal (0 = least significant) is set, else 0
    7217             : {
    7218             :     return (byVal & (1 << nBit)) ? 255 : 0;
    7219             : }
    7220             : #endif
    7221             : 
    7222             : /************************************************************************/
    7223             : /*                  ExpandEightPackedBitsToByteAt255()                  */
    7224             : /************************************************************************/
    7225             : 
    7226       15569 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,  // expands the 8 bits of byVal into 8 bytes, each set through ExtractBitAndConvertTo255()
    7227             :                                                     GByte abyOutput[8])  // all 8 entries are overwritten
    7228             : {
    7229       15569 :     abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);  // most-significant bit comes first in the output
    7230       15569 :     abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
    7231       15569 :     abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
    7232       15569 :     abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
    7233       15569 :     abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
    7234       15569 :     abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
    7235       15569 :     abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
    7236       15569 :     abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);  // least-significant bit comes last
    7237       15569 : }
    7238             : 
    7239             : /************************************************************************/
    7240             : /*                 GDALExpandPackedBitsToByteAt0Or255()                 */
    7241             : /************************************************************************/
    7242             : 
    7243             : /** Expand packed-bits (ordered from most-significant bit to least one)
    7244             :   into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
    7245             :   at 1 to a byte at 255.
    7246             : 
    7247             :  The function does (in a possibly more optimized way) the following:
    7248             :  \code{.cpp}
    7249             :  for (size_t i = 0; i < nInputBits; ++i )
    7250             :  {
    7251             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
    7252             :  }
    7253             :  \endcode
    7254             : 
    7255             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    7256             :  @param pabyOutput Output array of nInputBits bytes.
    7257             :  @param nInputBits Number of valid bits in pabyInput.
    7258             : 
    7259             :  @since 3.11
    7260             : */
    7261             : 
    7262       45145 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
    7263             :                                         GByte *CPL_RESTRICT pabyOutput,
    7264             :                                         size_t nInputBits)
    7265             : {
    7266       45145 :     const size_t nInputWholeBytes = nInputBits / 8;  // input bytes whose 8 bits are all valid
    7267       45145 :     size_t iByte = 0;  // input byte index, shared by the SSE2 loop and the scalar loops
    7268             : 
    7269             : #ifdef HAVE_SSE2
    7270             :     // Mask to isolate each bit: in each 8-byte half, lane 0 holds 0x80 (bit 7) down to lane 7 holding 0x01 (bit 0)
    7271       45145 :     const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
    7272             :                                           8, 16, 32, 64, -128);
    7273       45145 :     const __m128i zero = _mm_setzero_si128();
    7274       45145 :     const __m128i all_ones = _mm_set1_epi8(-1);
    7275             : #ifdef __SSSE3__
    7276             :     const __m128i dispatch_two_bytes =
    7277             :         _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);  // shuffle indices: low 8 lanes take input byte 0, high 8 lanes take input byte 1
    7278             : #endif
    7279       45145 :     constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);  // size in bytes of one __m128i
    7280      135654 :     for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
    7281             :     {
    7282       90509 :         __m128i reg_ori = _mm_loadu_si128(
    7283       90509 :             reinterpret_cast<const __m128i *>(pabyInput + iByte));
    7284             : 
    7285       90509 :         constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;  // each inner iteration expands 2 input bytes into one full output register
    7286      814581 :         for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
    7287             :         {
    7288             :             // Given reg_ori = (A, B, ... 14 other bytes ...),
    7289             :             // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
    7290             : #ifdef __SSSE3__
    7291             :             __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
    7292             : #else
    7293      724072 :             __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);  // (A, A, B, B, ...)
    7294      724072 :             reg = _mm_unpacklo_epi16(reg, reg);  // (A, A, A, A, B, B, B, B, ...)
    7295      724072 :             reg = _mm_unpacklo_epi32(reg, reg);  // (A x 8, B x 8)
    7296             : #endif
    7297             : 
    7298             :             // Keep, in each lane, only the bit of interest for that lane
    7299      724072 :             reg = _mm_and_si128(reg, bit_mask);
    7300             : 
    7301             :             // Now test if those bits are set, by comparing to zero. So the
    7302             :             // result will be that bytes where bits are set will be at 0, and
    7303             :             // ones where they are cleared will be at 0xFF. So the inverse of
    7304             :             // the end result we want!
    7305      724072 :             reg = _mm_cmpeq_epi8(reg, zero);
    7306             : 
    7307             :             // Invert the result (andnot computes (~reg) & all_ones)
    7308      724072 :             reg = _mm_andnot_si128(reg, all_ones);
    7309             : 
    7310             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
    7311             : 
    7312      724072 :             pabyOutput += SSE_REG_SIZE;
    7313             : 
    7314             :             // Right-shift of 2 bytes, so that the next two input bytes become lanes 0 and 1
    7315      724072 :             reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
    7316             :         }
    7317             :     }
    7318             : 
    7319             : #endif  // HAVE_SSE2
    7320             : 
    7321       60714 :     for (; iByte < nInputWholeBytes; ++iByte)  // whole bytes not consumed by the SSE2 loop (all of them without SSE2)
    7322             :     {
    7323       15569 :         ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
    7324       15569 :         pabyOutput += 8;
    7325             :     }
    7326       45483 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)  // leading bits of the final, partially valid byte
    7327             :     {
    7328         338 :         *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
    7329         338 :         ++pabyOutput;
    7330             :     }
    7331       45145 : }
    7332             : 
    7333             : /************************************************************************/
    7334             : /*                   ExpandEightPackedBitsToByteAt1()                   */
    7335             : /************************************************************************/
    7336             : 
    7337      136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,  // expands the 8 bits of byVal into 8 bytes valued 0 or 1
    7338             :                                                   GByte abyOutput[8])  // all 8 entries are overwritten
    7339             : {
    7340      136113 :     abyOutput[0] = (byVal >> 7) & 0x1;  // most-significant bit comes first in the output
    7341      136113 :     abyOutput[1] = (byVal >> 6) & 0x1;
    7342      136113 :     abyOutput[2] = (byVal >> 5) & 0x1;
    7343      136113 :     abyOutput[3] = (byVal >> 4) & 0x1;
    7344      136113 :     abyOutput[4] = (byVal >> 3) & 0x1;
    7345      136113 :     abyOutput[5] = (byVal >> 2) & 0x1;
    7346      136113 :     abyOutput[6] = (byVal >> 1) & 0x1;
    7347      136113 :     abyOutput[7] = (byVal >> 0) & 0x1;  // least-significant bit comes last
    7348      136113 : }
    7349             : 
    7350             : /************************************************************************/
    7351             : /*                  GDALExpandPackedBitsToByteAt0Or1()                  */
    7352             : /************************************************************************/
    7353             : 
    7354             : /** Expand packed-bits (ordered from most-significant bit to least one)
    7355             :   into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
    7356             :   at 1 to a byte at 1.
    7357             : 
    7358             :  The function does (in a possibly more optimized way) the following:
    7359             :  \code{.cpp}
    7360             :  for (size_t i = 0; i < nInputBits; ++i )
    7361             :  {
    7362             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
    7363             :  }
    7364             :  \endcode
    7365             : 
    7366             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    7367             :  @param pabyOutput Output array of nInputBits bytes.
    7368             :  @param nInputBits Number of valid bits in pabyInput.
    7369             : 
    7370             :  @since 3.11
    7371             : */
    7372             : 
    7373        7033 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
    7374             :                                       GByte *CPL_RESTRICT pabyOutput,
    7375             :                                       size_t nInputBits)
    7376             : {
    7377        7033 :     const size_t nInputWholeBytes = nInputBits / 8;  // input bytes whose 8 bits are all valid
    7378        7033 :     size_t iByte = 0;  // kept after the loop: it then indexes the final, partially valid byte
    7379      143146 :     for (; iByte < nInputWholeBytes; ++iByte)
    7380             :     {
    7381      136113 :         ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
    7382      136113 :         pabyOutput += 8;
    7383             :     }
    7384       18886 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)  // leading bits of the final, partially valid byte
    7385             :     {
    7386       11853 :         *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;  // iBit = 0 selects the most-significant bit
    7387       11853 :         ++pabyOutput;
    7388             :     }
    7389        7033 : }

Generated by: LCOV version 1.14