LCOV - code coverage report
Current view: top level - gcore - rasterio.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2800 3052 91.7 %
Date: 2026-07-03 10:13:58 Functions: 691 731 94.5 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  Contains default implementation of GDALRasterBand::IRasterIO()
       5             :  *           and supporting functions of broader utility.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 1998, Frank Warmerdam
      10             :  * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal.h"
      17             : #include "gdal_priv.h"
      18             : 
      19             : #include <cassert>
      20             : #include <climits>
      21             : #include <cmath>
      22             : #include <cstddef>
      23             : #include <cstdio>
      24             : #include <cstdlib>
      25             : #include <cstring>
      26             : 
      27             : #include <algorithm>
      28             : #include <limits>
      29             : #include <stdexcept>
      30             : #include <type_traits>
      31             : 
      32             : #include "cpl_conv.h"
      33             : #include "cpl_cpu_features.h"
      34             : #include "cpl_error.h"
      35             : #include "cpl_float.h"
      36             : #include "cpl_progress.h"
      37             : #include "cpl_string.h"
      38             : #include "cpl_vsi.h"
      39             : #include "gdal_priv_templates.hpp"
      40             : #include "gdal_vrt.h"
      41             : #include "gdalwarper.h"
      42             : #include "memdataset.h"
      43             : #include "vrtdataset.h"
      44             : 
      45             : #if defined(__x86_64) || defined(_M_X64)
      46             : #include <emmintrin.h>
      47             : #include <immintrin.h>
      48             : #define HAVE_SSE2
      49             : // AVX2 dispatch: compile AVX2 code with target attribute, detect at runtime
      50             : #if (defined(__GNUC__) || defined(__clang__)) &&                               \
      51             :     defined(HAVE_AVX2_AT_COMPILE_TIME)
      52             : #define HAVE_AVX2_DISPATCH
      53             : #elif defined(_MSC_VER)
      54             : #include <intrin.h>
      55             : #define HAVE_AVX2_DISPATCH
      56             : #endif
      57             : #elif defined(USE_NEON_OPTIMIZATIONS)
      58             : #include "include_sse2neon.h"
      59             : #define HAVE_SSE2
      60             : #endif
      61             : 
      62             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
      63             : #include "rasterio_ssse3.h"
      64             : #ifdef __SSSE3__
      65             : #include <tmmintrin.h>
      66             : #endif
      67             : #endif
      68             : 
      69             : #ifdef __SSE4_1__
      70             : #include <smmintrin.h>
      71             : #endif
      72             : 
      73             : #ifdef __GNUC__
      74             : #define CPL_NOINLINE __attribute__((noinline))
      75             : #else
      76             : #define CPL_NOINLINE
      77             : #endif
      78             : // Forward declaration: called by DownsamplingIntegerXFactor() below for 1-byte pixels; the definition is outside this part of the file.
      79             : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
      80             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
      81             :                              int nDstPixelStride, GPtrDiff_t nWordCount);
      82             : 
      83             : /************************************************************************/
      84             : /*                     DownsamplingIntegerXFactor()                     */
      85             : /************************************************************************/
      86             : // Copies nBufXSize pixels, taken every nSrcXInc source columns starting at iSrcX, into pabyDstData (spaced by nPixelSpace bytes), locking a new block each time iSrcX leaves the current one. Returns false if a block cannot be locked, true otherwise.
      87             : template <bool bSameDataType, int DATA_TYPE_SIZE>
      88      695860 : static bool DownsamplingIntegerXFactor(
      89             :     GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,  // iSrcOffsetCst: constant pixel offset added to the in-block column (see iSrcOffset below)
      90             :     GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
      91             :     GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
      92             :     int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)  // nStartBlockX and poBlock are in/out: both are updated whenever another block is locked
      93             : {
      94      695860 :     const int nBandDataSize =
      95             :         bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);  // compile-time size when source and buffer types match
      96      695860 :     int nOuterLoopIters = nBufXSize - 1;  // the final output pixel is handled separately after the loop
      97      695860 :     const int nIncSrcOffset = nSrcXInc * nBandDataSize;  // byte step between two sampled source pixels
      98             :     const GByte *CPL_RESTRICT pabySrcData;
      99      695860 :     int nEndBlockX = nBlockXSize + nStartBlockX;  // first column past the block described by nStartBlockX
     100             : 
     101      695860 :     if (iSrcX < nEndBlockX)
     102             :     {
     103      295071 :         CPLAssert(poBlock);
     104      295071 :         goto no_reload_block;  // iSrcX lies in the block already held: enter the loop body after the locking step
     105             :     }
     106      400789 :     goto reload_block;  // otherwise enter the loop body at the locking step
     107             : 
     108             :     // Don't do the last iteration in the loop, as iSrcX might go beyond
     109             :     // nRasterXSize - 1
     110     1265133 :     while (--nOuterLoopIters >= 1)
     111             :     {
     112      201834 :         iSrcX += nSrcXInc;
     113      201834 :         pabySrcData += nIncSrcOffset;
     114      201834 :         pabyDstData += nPixelSpace;
     115             : 
     116             :         /* --------------------------------------------------------------------
     117             :          */
     118             :         /*      Ensure we have the appropriate block loaded. */
     119             :         /* --------------------------------------------------------------------
     120             :          */
     121      201834 :         if (iSrcX >= nEndBlockX)
     122             :         {
     123      201834 :         reload_block:
     124             :         {
     125      615213 :             const int nLBlockX = iSrcX / nBlockXSize;
     126      615213 :             nStartBlockX = nLBlockX * nBlockXSize;
     127      615213 :             nEndBlockX = nStartBlockX + nBlockXSize;
     128             : 
     129      615213 :             if (poBlock != nullptr)
     130      341376 :                 poBlock->DropLock();  // release the previously held block before locking the next one
     131             : 
     132      615213 :             poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     133      615213 :             if (poBlock == nullptr)
     134             :             {
     135           1 :                 return false;  // the block could not be obtained
     136             :             }
     137             :         }
     138             : 
     139      615212 :         no_reload_block:
     140             :             const GByte *pabySrcBlock =
     141     1265133 :                 static_cast<const GByte *>(poBlock->GetDataRef());
     142     1265133 :             GPtrDiff_t iSrcOffset =
     143     1265133 :                 (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;  // byte offset of the current pixel inside the block buffer
     144     1265133 :             pabySrcData = pabySrcBlock + iSrcOffset;
     145             :         }
     146             : 
     147             :         /* --------------------------------------------------------------------
     148             :          */
     149             :         /*      Copy the maximum run of pixels. */
     150             :         /* --------------------------------------------------------------------
     151             :          */
     152             : 
     153     1265133 :         const int nIters = std::min(
     154     1265133 :             (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);  // samples left in this block (rounded up), capped by the remaining loop count
     155             :         if (bSameDataType)
     156             :         {
     157     1264690 :             memcpy(pabyDstData, pabySrcData, nBandDataSize);  // the current pixel is always copied, even when nIters is 0
     158     1264690 :             if (nIters > 1)
     159             :             {
     160             :                 if (DATA_TYPE_SIZE == 1)
     161             :                 {
     162      326330 :                     pabySrcData += nIncSrcOffset;
     163      326330 :                     pabyDstData += nPixelSpace;
     164      326330 :                     GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
     165      326330 :                                      nPixelSpace, nIters - 1);
     166      326330 :                     pabySrcData +=
     167      326330 :                         static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);  // one step was applied before the call, so nIters - 2 more reach the last copied pixel
     168      326330 :                     pabyDstData +=
     169      326330 :                         static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
     170             :                 }
     171             :                 else
     172             :                 {
     173     4395716 :                     for (int i = 0; i < nIters - 1; i++)
     174             :                     {
     175     4197550 :                         pabySrcData += nIncSrcOffset;
     176     4197550 :                         pabyDstData += nPixelSpace;
     177     4197550 :                         memcpy(pabyDstData, pabySrcData, nBandDataSize);
     178             :                     }
     179             :                 }
     180      524500 :                 iSrcX += nSrcXInc * (nIters - 1);  // leave iSrcX and the counter on the last pixel copied in this run
     181      524500 :                 nOuterLoopIters -= nIters - 1;
     182             :             }
     183             :         }
     184             :         else
     185             :         {
     186             :             // Type to type conversion ...
     187         443 :             GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
     188         443 :                             eBufType, nPixelSpace, std::max(1, nIters));  // max(1, ...) so the current pixel is converted even when nIters is 0
     189         443 :             if (nIters > 1)
     190             :             {
     191         216 :                 pabySrcData +=
     192         216 :                     static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
     193         216 :                 pabyDstData +=
     194         216 :                     static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
     195         216 :                 iSrcX += nSrcXInc * (nIters - 1);
     196         216 :                 nOuterLoopIters -= nIters - 1;
     197             :             }
     198             :         }
     199             :     }
     200             : 
     201             :     // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
     202     1063299 :     if (nOuterLoopIters == 0)  // 0: one output pixel is still pending; after the re-entry below the loop test leaves the counter at -1 and this branch is skipped
     203             :     {
     204      367440 :         const int nRasterXSize = poBand->GetXSize();
     205      367440 :         iSrcX =
     206      734880 :             static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,  // sum done in 64 bits, then clamped to the last raster column
     207      367440 :                                       static_cast<GInt64>(nRasterXSize - 1)));
     208      367440 :         pabyDstData += nPixelSpace;
     209      367440 :         if (iSrcX < nEndBlockX)
     210             :         {
     211      354850 :             goto no_reload_block;  // jump back into the loop body to copy the final pixel
     212             :         }
     213       12590 :         goto reload_block;
     214             :     }
     215      695859 :     return true;
     216             : }
     217             : // Returns a * b. The attribute macro is defined outside this view; presumably it exempts this multiplication from unsigned-overflow sanitizer checks - TODO confirm.
     218             : template <class A, class B>
     219     2837680 : CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW inline auto CPLUnsanitizedMul(A a, B b)
     220             : {
     221     2837680 :     return a * b;
     222             : }
     223             : 
     224             : /************************************************************************/
     225             : /*                             IRasterIO()                              */
     226             : /*                                                                      */
     227             : /*      Default internal implementation of RasterIO() ... utilizes      */
     228             : /*      the Block access methods to satisfy the request.  This would    */
     229             : /*      normally only be overridden by formats with overviews.          */
     230             : /************************************************************************/
     231             : 
     232     6195820 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
     233             :                                  int nXSize, int nYSize, void *pData,
     234             :                                  int nBufXSize, int nBufYSize,
     235             :                                  GDALDataType eBufType, GSpacing nPixelSpace,
     236             :                                  GSpacing nLineSpace,
     237             :                                  GDALRasterIOExtraArg *psExtraArg)
     238             : 
     239             : {
     240     6195820 :     if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
     241             :     {
     242           0 :         CPLError(eFlushBlockErr, CPLE_AppDefined,
     243             :                  "An error occurred while writing a dirty block "
     244             :                  "from GDALRasterBand::IRasterIO");
     245           0 :         CPLErr eErr = eFlushBlockErr;
     246           0 :         eFlushBlockErr = CE_None;
     247           0 :         return eErr;
     248             :     }
     249     6195820 :     if (nBlockXSize <= 0 || nBlockYSize <= 0)
     250             :     {
     251           0 :         CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
     252           0 :         return CE_Failure;
     253             :     }
     254             : 
     255     6195820 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
     256     6195820 :     const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
     257     6195820 :     GByte dummyBlock[2] = {0, 0};
     258     6195820 :     GByte *pabySrcBlock =
     259             :         dummyBlock; /* to avoid Coverity warning about nullptr dereference */
     260     6195820 :     GDALRasterBlock *poBlock = nullptr;
     261     6195820 :     const bool bUseIntegerRequestCoords =
     262     6562500 :         (!psExtraArg->bFloatingPointWindowValidity ||
     263      366678 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
     264      341746 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
     265             : 
     266             :     /* ==================================================================== */
     267             :     /*      A common case is the data requested with the destination        */
     268             :     /*      is packed, and the block width is the raster width.             */
     269             :     /* ==================================================================== */
     270     6102460 :     if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
     271     3241510 :         nBlockXSize == GetXSize() && nBufXSize == nXSize &&
     272    12298300 :         nBufYSize == nYSize && bUseIntegerRequestCoords)
     273             :     {
     274     3100360 :         CPLErr eErr = CE_None;
     275     3100360 :         int nLBlockY = -1;
     276             : 
     277     9807210 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     278             :         {
     279     6707940 :             const int iSrcY = iBufYOff + nYOff;
     280             : 
     281     6707940 :             if (iSrcY < nLBlockY * nBlockYSize ||
     282     6707940 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
     283             :             {
     284     3369770 :                 nLBlockY = iSrcY / nBlockYSize;
     285     3369770 :                 bool bJustInitialize =
     286      298230 :                     eRWFlag == GF_Write && nXOff == 0 &&
     287     3726110 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     288       58115 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
     289             : 
     290             :                 // Is this a partial tile at right and/or bottom edges of
     291             :                 // the raster, and that is going to be completely written?
     292             :                 // If so, do not load it from storage, but zero it so that
     293             :                 // the content outside of the validity area is initialized.
     294     3369770 :                 bool bMemZeroBuffer = false;
     295      298230 :                 if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
     296       25683 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     297     3668090 :                     nYOff + nYSize == GetYSize() &&
     298          90 :                     nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
     299             :                 {
     300          90 :                     bJustInitialize = true;
     301          90 :                     bMemZeroBuffer = true;
     302             :                 }
     303             : 
     304     3369770 :                 if (poBlock)
     305      269410 :                     poBlock->DropLock();
     306             : 
     307     3369770 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     308     3369770 :                 poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
     309     3369770 :                 if (poBlock == nullptr)
     310             :                 {
     311        1079 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     312             :                         nullptr)
     313             :                     {
     314           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     315             :                                  "GetBlockRef failed at X block offset %d, "
     316             :                                  "Y block offset %d%s",
     317             :                                  0, nLBlockY,
     318           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     319           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     320             :                                      : "");
     321             :                     }
     322        1079 :                     eErr = CE_Failure;
     323        1079 :                     break;
     324             :                 }
     325             : 
     326     3368690 :                 if (eRWFlag == GF_Write)
     327      298230 :                     poBlock->MarkDirty();
     328             : 
     329     3368690 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     330     3368690 :                 if (bMemZeroBuffer)
     331             :                 {
     332          90 :                     memset(pabySrcBlock, 0,
     333          90 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     334          90 :                                nBlockXSize * nBlockYSize);
     335             :                 }
     336             :             }
     337             : 
     338     6706860 :             const auto nSrcByteOffset =
     339     6706860 :                 (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
     340     6706860 :                      nBlockXSize +
     341     6706860 :                  nXOff) *
     342     6706860 :                 nBandDataSize;
     343             : 
     344     6706860 :             if (eDataType == eBufType)
     345             :             {
     346     3041780 :                 if (eRWFlag == GF_Read)
     347     2566430 :                     memcpy(static_cast<GByte *>(pData) +
     348     2566430 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     349     2566430 :                            pabySrcBlock + nSrcByteOffset,
     350             :                            static_cast<size_t>(nLineSpace));
     351             :                 else
     352      475352 :                     memcpy(pabySrcBlock + nSrcByteOffset,
     353      475352 :                            static_cast<GByte *>(pData) +
     354      475352 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     355             :                            static_cast<size_t>(nLineSpace));
     356             :             }
     357             :             else
     358             :             {
     359             :                 // Type to type conversion.
     360     3665080 :                 if (eRWFlag == GF_Read)
     361     3642780 :                     GDALCopyWords64(
     362     3642780 :                         pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
     363             :                         static_cast<GByte *>(pData) +
     364     3642780 :                             static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     365             :                         eBufType, static_cast<int>(nPixelSpace), nBufXSize);
     366             :                 else
     367       22299 :                     GDALCopyWords64(static_cast<GByte *>(pData) +
     368       22299 :                                         static_cast<GPtrDiff_t>(iBufYOff) *
     369             :                                             nLineSpace,
     370             :                                     eBufType, static_cast<int>(nPixelSpace),
     371       22299 :                                     pabySrcBlock + nSrcByteOffset, eDataType,
     372             :                                     nBandDataSize, nBufXSize);
     373             :             }
     374             : 
     375     6795020 :             if (psExtraArg->pfnProgress != nullptr &&
     376       88164 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     377             :                                          psExtraArg->pProgressData))
     378             :             {
     379           5 :                 eErr = CE_Failure;
     380           5 :                 break;
     381             :             }
     382             :         }
     383             : 
     384     3100360 :         if (poBlock)
     385     3099280 :             poBlock->DropLock();
     386             : 
     387     3100360 :         return eErr;
     388             :     }
     389             : 
     390             :     /* ==================================================================== */
     391             :     /*      Do we have overviews that would be appropriate to satisfy       */
     392             :     /*      this request?                                                   */
     393             :     /* ==================================================================== */
     394     3095470 :     if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
     395             :         eRWFlag == GF_Read)
     396             :     {
     397             :         GDALRasterIOExtraArg sExtraArg;
     398        2967 :         GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
     399             : 
     400             :         const int nOverview =
     401        2967 :             GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
     402             :                                           nBufXSize, nBufYSize, &sExtraArg);
     403        2967 :         if (nOverview >= 0)
     404             :         {
     405        2892 :             GDALRasterBand *poOverviewBand = GetOverview(nOverview);
     406        2892 :             if (poOverviewBand == nullptr)
     407        2892 :                 return CE_Failure;
     408             : 
     409        2892 :             return poOverviewBand->RasterIO(
     410             :                 eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
     411        2892 :                 nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
     412             :         }
     413             :     }
     414             : 
     415      902985 :     if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
     416           6 :         nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
     417     3995570 :         nLineSpace == nPixelSpace * nBufXSize &&
     418           6 :         CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
     419             :     {
     420           0 :         memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
     421           0 :         return CE_None;
     422             :     }
     423             : 
     424             :     /* ==================================================================== */
     425             :     /*      The second case when we don't need subsample data but likely    */
     426             :     /*      need data type conversion.                                      */
     427             :     /* ==================================================================== */
     428     3092580 :     if (  // nPixelSpace == nBufDataSize &&
     429     3092580 :         nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
     430             :     {
     431             : #if DEBUG_VERBOSE
     432             :         printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
     433             :                nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
     434             : #endif
     435             : 
     436             :         /* --------------------------------------------------------------------
     437             :          */
     438             :         /*      Loop over buffer computing source locations. */
     439             :         /* --------------------------------------------------------------------
     440             :          */
     441             :         // Calculate starting values out of loop
     442     2513120 :         const int nLBlockXStart = nXOff / nBlockXSize;
     443     2513120 :         const int nXSpanEnd = nBufXSize + nXOff;
     444             : 
     445     2513120 :         int iBufYOff = 0;
     446     2513120 :         int iSrcY = nYOff;
     447             :         while (true)
     448             :         {
     449     2554230 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     450             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     451     2554230 :             int nLBlockY = iSrcY / nBlockYSize;
     452     2554230 :             int nLBlockX = nLBlockXStart;
     453     2554230 :             int iSrcX = nXOff;
     454     5391830 :             while (iSrcX < nXSpanEnd)
     455             :             {
     456     2837680 :                 int nXSpan = nLBlockX * nBlockXSize;
     457     2837680 :                 if (nXSpan < INT_MAX - nBlockXSize)
     458     2837680 :                     nXSpan += nBlockXSize;
     459             :                 else
     460           0 :                     nXSpan = INT_MAX;
     461     2837680 :                 const int nXRight = nXSpan;
     462     2837680 :                 nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
     463             : 
     464             :                 const size_t nXSpanSize =
     465     2837680 :                     CPLUnsanitizedMul(nXSpan, static_cast<size_t>(nPixelSpace));
     466             : 
     467     2837680 :                 bool bJustInitialize =
     468     2043130 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
     469       38155 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
     470     4907220 :                     nXOff <= nLBlockX * nBlockXSize &&
     471       26418 :                     nXOff + nXSize >= nXRight;
     472             : 
     473             :                 // Is this a partial tile at right and/or bottom edges of
     474             :                 // the raster, and that is going to be completely written?
     475             :                 // If so, do not load it from storage, but zero it so that
     476             :                 // the content outside of the validity area is initialized.
     477     2837680 :                 bool bMemZeroBuffer = false;
     478     2043130 :                 if (eRWFlag == GF_Write && !bJustInitialize &&
     479     2017970 :                     nXOff <= nLBlockX * nBlockXSize &&
     480     2016310 :                     nYOff <= nLBlockY * nBlockYSize &&
     481       12227 :                     (nXOff + nXSize >= nXRight ||
     482             :                      // cppcheck-suppress knownConditionTrueFalse
     483     4883590 :                      (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
     484       12047 :                     (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
     485       10809 :                      (nYOff + nYSize == GetYSize() &&
     486        2015 :                       nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
     487             :                 {
     488        3253 :                     bJustInitialize = true;
     489        3253 :                     bMemZeroBuffer = true;
     490             :                 }
     491             : 
     492             :                 /* --------------------------------------------------------------------
     493             :                  */
     494             :                 /*      Ensure we have the appropriate block loaded. */
     495             :                 /* --------------------------------------------------------------------
     496             :                  */
     497     2837680 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     498     2837680 :                 poBlock =
     499     2837680 :                     GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     500     2837680 :                 if (!poBlock)
     501             :                 {
     502          73 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     503             :                         nullptr)
     504             :                     {
     505           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     506             :                                  "GetBlockRef failed at X block offset %d, "
     507             :                                  "Y block offset %d%s",
     508             :                                  nLBlockX, nLBlockY,
     509           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     510           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     511             :                                      : "");
     512             :                     }
     513          73 :                     return (CE_Failure);
     514             :                 }
     515             : 
     516     2837600 :                 if (eRWFlag == GF_Write)
     517     2043130 :                     poBlock->MarkDirty();
     518             : 
     519     2837600 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     520     2837600 :                 if (bMemZeroBuffer)
     521             :                 {
     522        3253 :                     memset(pabySrcBlock, 0,
     523        3253 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     524        3253 :                                nBlockXSize * nBlockYSize);
     525             :                 }
     526             :                 /* --------------------------------------------------------------------
     527             :                  */
     528             :                 /*      Copy over this chunk of data. */
     529             :                 /* --------------------------------------------------------------------
     530             :                  */
     531     2837600 :                 GPtrDiff_t iSrcOffset =
     532     2837600 :                     (static_cast<GPtrDiff_t>(iSrcX) -
     533     2837600 :                      static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
     534     2837600 :                      (static_cast<GPtrDiff_t>(iSrcY) -
     535     2837600 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     536     2837600 :                          nBlockXSize) *
     537     2837600 :                     nBandDataSize;
     538             :                 // Fill up as many rows as possible for the loaded block.
     539     5675200 :                 const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
     540     2837600 :                                           nBufYSize - iBufYOff);
     541    62418100 :                 for (int k = 0; k < kmax; k++)
     542             :                 {
     543    59580500 :                     if (eDataType == eBufType && nPixelSpace == nBufDataSize)
     544             :                     {
     545    55176100 :                         if (eRWFlag == GF_Read)
     546    50723100 :                             memcpy(static_cast<GByte *>(pData) + iBufOffset +
     547    50723100 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     548    50723100 :                                    pabySrcBlock + iSrcOffset, nXSpanSize);
     549             :                         else
     550     4453010 :                             memcpy(pabySrcBlock + iSrcOffset,
     551     4453010 :                                    static_cast<GByte *>(pData) + iBufOffset +
     552     4453010 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     553             :                                    nXSpanSize);
     554             :                     }
     555             :                     else
     556             :                     {
     557             :                         /* type to type conversion */
     558     4404370 :                         if (eRWFlag == GF_Read)
     559     4254160 :                             GDALCopyWords64(
     560     4254160 :                                 pabySrcBlock + iSrcOffset, eDataType,
     561             :                                 nBandDataSize,
     562     4254160 :                                 static_cast<GByte *>(pData) + iBufOffset +
     563     4254160 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     564             :                                 eBufType, static_cast<int>(nPixelSpace),
     565             :                                 nXSpan);
     566             :                         else
     567      150209 :                             GDALCopyWords64(
     568      150209 :                                 static_cast<GByte *>(pData) + iBufOffset +
     569      150209 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     570             :                                 eBufType, static_cast<int>(nPixelSpace),
     571      150209 :                                 pabySrcBlock + iSrcOffset, eDataType,
     572             :                                 nBandDataSize, nXSpan);
     573             :                     }
     574             : 
     575    59580500 :                     iSrcOffset +=
     576    59580500 :                         static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
     577             :                 }
     578             : 
     579             :                 iBufOffset =
     580     2837600 :                     CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
     581     2837600 :                 nLBlockX++;
     582     2837600 :                 iSrcX += nXSpan;
     583             : 
     584     2837600 :                 poBlock->DropLock();
     585     2837600 :                 poBlock = nullptr;
     586             :             }
     587             : 
     588             :             /* Compute the increment to go on a block boundary */
     589     2554160 :             const int nYInc = nBlockYSize - (iSrcY % nBlockYSize);
     590             : 
     591     2556050 :             if (psExtraArg->pfnProgress != nullptr &&
     592        1889 :                 !psExtraArg->pfnProgress(
     593     2556050 :                     1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
     594             :                     psExtraArg->pProgressData))
     595             :             {
     596           0 :                 return CE_Failure;
     597             :             }
     598             : 
     599     2554160 :             iBufYOff += nYInc;
     600     2554160 :             if (iBufYOff >= nBufYSize)
     601     2513050 :                 break;
     602             :             // Only increment iSrcY after above loop end check, to avoid
     603             :             // potential int overflow.
     604       41106 :             iSrcY += nYInc;
     605       41106 :         }
     606             : 
     607     2513050 :         return CE_None;
     608             :     }
     609             : 
     610             :     /* ==================================================================== */
     611             :     /*      Loop reading required source blocks to satisfy output           */
     612             :     /*      request.  This is the most general implementation.              */
     613             :     /* ==================================================================== */
     614             : 
     615      579453 :     double dfXOff = nXOff;
     616      579453 :     double dfYOff = nYOff;
     617      579453 :     double dfXSize = nXSize;
     618      579453 :     double dfYSize = nYSize;
     619      579453 :     if (psExtraArg->bFloatingPointWindowValidity)
     620             :     {
     621      244495 :         dfXOff = psExtraArg->dfXOff;
     622      244495 :         dfYOff = psExtraArg->dfYOff;
     623      244495 :         dfXSize = psExtraArg->dfXSize;
     624      244495 :         dfYSize = psExtraArg->dfYSize;
     625             :     }
     626             : 
     627             :     /* -------------------------------------------------------------------- */
     628             :     /*      Compute stepping increment.                                     */
     629             :     /* -------------------------------------------------------------------- */
     630      579453 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
     631      579453 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
     632      579453 :     CPLErr eErr = CE_None;
     633             : 
     634      579453 :     if (eRWFlag == GF_Write)
     635             :     {
     636             :         /* --------------------------------------------------------------------
     637             :          */
     638             :         /*    Write case */
     639             :         /*    Loop over raster window computing source locations in the buffer.
     640             :          */
     641             :         /* --------------------------------------------------------------------
     642             :          */
     643      166655 :         GByte *pabyDstBlock = nullptr;
     644      166655 :         int nLBlockX = -1;
     645      166655 :         int nLBlockY = -1;
     646             : 
     647     1260010 :         for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
     648             :         {
     649     1093360 :             const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
     650             : 
     651    12384200 :             for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
     652             :             {
     653    11290800 :                 const int iBufXOff =
     654    11290800 :                     static_cast<int>((iDstX - nXOff) / dfSrcXInc);
     655    11290800 :                 GPtrDiff_t iBufOffset =
     656    11290800 :                     static_cast<GPtrDiff_t>(iBufYOff) *
     657             :                         static_cast<GPtrDiff_t>(nLineSpace) +
     658    11290800 :                     iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
     659             : 
     660             :                 // FIXME: this code likely doesn't work if the dirty block gets
     661             :                 // flushed to disk before being completely written.
     662             :                 // In the meantime, bJustInitialize should probably be set to
     663             :                 // FALSE even if it is not ideal performance wise, and for
     664             :                 // lossy compression.
     665             : 
     666             :                 /* --------------------------------------------------------------------
     667             :                  */
     668             :                 /*      Ensure we have the appropriate block loaded. */
     669             :                 /* --------------------------------------------------------------------
     670             :                  */
     671    11290800 :                 if (iDstX < nLBlockX * nBlockXSize ||
     672    11041500 :                     iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
     673    10584800 :                     iDstY < nLBlockY * nBlockYSize ||
     674    10584800 :                     iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
     675             :                 {
     676      738702 :                     nLBlockX = iDstX / nBlockXSize;
     677      738702 :                     nLBlockY = iDstY / nBlockYSize;
     678             : 
     679      738702 :                     const bool bJustInitialize =
     680     1065990 :                         nYOff <= nLBlockY * nBlockYSize &&
     681      327291 :                         nYOff + nYSize - nBlockYSize >=
     682      327291 :                             nLBlockY * nBlockYSize &&
     683     1116320 :                         nXOff <= nLBlockX * nBlockXSize &&
     684       50325 :                         nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
     685             :                     /*bool bMemZeroBuffer = FALSE;
     686             :                     if( !bJustInitialize &&
     687             :                         nXOff <= nLBlockX * nBlockXSize &&
     688             :                         nYOff <= nLBlockY * nBlockYSize &&
     689             :                         (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
     690             :                          (nXOff + nXSize == GetXSize() &&
     691             :                          (nLBlockX+1) * nBlockXSize > GetXSize())) &&
     692             :                         (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
     693             :                          (nYOff + nYSize == GetYSize() &&
     694             :                          (nLBlockY+1) * nBlockYSize > GetYSize())) )
     695             :                     {
     696             :                         bJustInitialize = TRUE;
     697             :                         bMemZeroBuffer = TRUE;
     698             :                     }*/
     699      738702 :                     if (poBlock != nullptr)
     700      572047 :                         poBlock->DropLock();
     701             : 
     702      738702 :                     poBlock =
     703      738702 :                         GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     704      738702 :                     if (poBlock == nullptr)
     705             :                     {
     706           0 :                         return (CE_Failure);
     707             :                     }
     708             : 
     709      738702 :                     poBlock->MarkDirty();
     710             : 
     711      738702 :                     pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
     712             :                     /*if( bMemZeroBuffer )
     713             :                     {
     714             :                         memset(pabyDstBlock, 0,
     715             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
     716             :                     * nBlockYSize);
     717             :                     }*/
     718             :                 }
     719             : 
     720             :                 // To make Coverity happy. Should not happen by design.
     721    11290800 :                 if (pabyDstBlock == nullptr)
     722             :                 {
     723           0 :                     CPLAssert(false);
     724             :                     eErr = CE_Failure;
     725             :                     break;
     726             :                 }
     727             : 
     728             :                 /* --------------------------------------------------------------------
     729             :                  */
     730             :                 /*      Copy over this pixel of data. */
     731             :                 /* --------------------------------------------------------------------
     732             :                  */
     733    11290800 :                 GPtrDiff_t iDstOffset =
     734    11290800 :                     (static_cast<GPtrDiff_t>(iDstX) -
     735    11290800 :                      static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
     736    11290800 :                      (static_cast<GPtrDiff_t>(iDstY) -
     737    11290800 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     738    11290800 :                          nBlockXSize) *
     739    11290800 :                     nBandDataSize;
     740             : 
     741    11290800 :                 if (eDataType == eBufType)
     742             :                 {
     743    11287700 :                     memcpy(pabyDstBlock + iDstOffset,
     744    11287700 :                            static_cast<GByte *>(pData) + iBufOffset,
     745             :                            nBandDataSize);
     746             :                 }
     747             :                 else
     748             :                 {
     749             :                     /* type to type conversion ... ouch, this is expensive way
     750             :                     of handling single words */
     751        3096 :                     GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
     752        3096 :                                     eBufType, 0, pabyDstBlock + iDstOffset,
     753             :                                     eDataType, 0, 1);
     754             :                 }
     755             :             }
     756             : 
     757     1093360 :             if (psExtraArg->pfnProgress != nullptr &&
     758           0 :                 !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
     759             :                                          psExtraArg->pProgressData))
     760             :             {
     761           0 :                 eErr = CE_Failure;
     762           0 :                 break;
     763             :             }
     764             :         }
     765             :     }
     766             :     else
     767             :     {
     768      412798 :         if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
     769             :         {
     770       46692 :             if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
     771       15098 :                  psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
     772       15045 :                  psExtraArg->eResampleAlg == GRIORA_Bilinear ||
     773       31641 :                  psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
     774        4763 :                 GetColorTable() != nullptr)
     775             :             {
     776           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     777             :                          "Resampling method not supported on paletted band. "
     778             :                          "Falling back to nearest neighbour");
     779             :             }
     780       15800 :             else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
     781           3 :                      GDALDataTypeIsComplex(eDataType))
     782             :             {
     783           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     784             :                          "Resampling method not supported on complex data type "
     785             :                          "band. Falling back to nearest neighbour");
     786             :             }
     787             :             else
     788             :             {
     789       15797 :                 return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
     790             :                                          pData, nBufXSize, nBufYSize, eBufType,
     791       15797 :                                          nPixelSpace, nLineSpace, psExtraArg);
     792             :             }
     793             :         }
     794             : 
     795      397001 :         int nLimitBlockY = 0;
     796      397001 :         const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
     797      397001 :         int nStartBlockX = -nBlockXSize;
     798      397001 :         constexpr double EPS = 1e-10;
     799      397001 :         int nLBlockY = -1;
     800      397001 :         const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
     801      397001 :         const bool bIntegerXFactor =
     802      372768 :             bUseIntegerRequestCoords &&
     803      670838 :             static_cast<int>(dfSrcXInc) == dfSrcXInc &&
     804      273837 :             static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
     805             : 
     806             :         /* --------------------------------------------------------------------
     807             :          */
     808             :         /*      Read case */
     809             :         /*      Loop over buffer computing source locations. */
     810             :         /* --------------------------------------------------------------------
     811             :          */
     812     2367110 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     813             :         {
     814             :             // Add small epsilon to avoid some numeric precision issues.
     815     1970120 :             const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
     816     1970120 :             const int iSrcY = static_cast<int>(std::min(
     817     1970120 :                 std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
     818             : 
     819     1970120 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     820             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     821             : 
     822     1970120 :             if (iSrcY >= nLimitBlockY)
     823             :             {
     824      438019 :                 nLBlockY = iSrcY / nBlockYSize;
     825      438019 :                 nLimitBlockY = nLBlockY * nBlockYSize;
     826      438019 :                 if (nLimitBlockY < INT_MAX - nBlockYSize)
     827      438019 :                     nLimitBlockY += nBlockYSize;
     828             :                 else
     829           0 :                     nLimitBlockY = INT_MAX;
     830             :                 // Make sure a new block is loaded.
     831      438019 :                 nStartBlockX = -nBlockXSize;
     832             :             }
     833     1532100 :             else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
     834             :             {
     835             :                 // Make sure a new block is loaded.
     836      437363 :                 nStartBlockX = -nBlockXSize;
     837             :             }
     838             : 
     839     1970120 :             GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
     840     1970120 :                                        static_cast<GPtrDiff_t>(nBlockXSize);
     841             : 
     842     1970120 :             if (bIntegerXFactor)
     843             :             {
     844      695860 :                 int iSrcX = static_cast<int>(dfSrcXStart);
     845      695860 :                 const int nSrcXInc = static_cast<int>(dfSrcXInc);
     846      695860 :                 GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
     847      695860 :                 bool bRet = false;
     848      695860 :                 if (bByteCopy)
     849             :                 {
     850      585852 :                     bRet = DownsamplingIntegerXFactor<true, 1>(
     851             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     852             :                         static_cast<int>(nPixelSpace), nBufXSize, GDT_UInt8,
     853             :                         GDT_UInt8, nStartBlockX, nBlockXSize, poBlock,
     854             :                         nLBlockY);
     855             :                 }
     856      110008 :                 else if (eDataType == eBufType)
     857             :                 {
     858      109783 :                     switch (nBandDataSize)
     859             :                     {
     860      109630 :                         case 2:
     861      109630 :                             bRet = DownsamplingIntegerXFactor<true, 2>(
     862             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     863             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     864             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     865             :                                 nBlockXSize, poBlock, nLBlockY);
     866      109630 :                             break;
     867          55 :                         case 4:
     868          55 :                             bRet = DownsamplingIntegerXFactor<true, 4>(
     869             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     870             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     871             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     872             :                                 nBlockXSize, poBlock, nLBlockY);
     873          55 :                             break;
     874          96 :                         case 8:
     875          96 :                             bRet = DownsamplingIntegerXFactor<true, 8>(
     876             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     877             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     878             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     879             :                                 nBlockXSize, poBlock, nLBlockY);
     880          96 :                             break;
     881           2 :                         case 16:
     882           2 :                             bRet = DownsamplingIntegerXFactor<true, 16>(
     883             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     884             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     885             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     886             :                                 nBlockXSize, poBlock, nLBlockY);
     887           2 :                             break;
     888           0 :                         default:
     889           0 :                             CPLAssert(false);
     890             :                             break;
     891             :                     }
     892             :                 }
     893             :                 else
     894             :                 {
     895         225 :                     bRet = DownsamplingIntegerXFactor<false, 0>(
     896             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     897             :                         static_cast<int>(nPixelSpace), nBufXSize, eDataType,
     898             :                         eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     899             :                 }
     900      695860 :                 if (!bRet)
     901           1 :                     eErr = CE_Failure;
     902             :             }
     903             :             else
     904             :             {
     905     1274260 :                 double dfSrcX = dfSrcXStart;
     906   503811000 :                 for (int iBufXOff = 0; iBufXOff < nBufXSize;
     907   502537000 :                      iBufXOff++, dfSrcX += dfSrcXInc)
     908             :                 {
     909             :                     // TODO?: try to avoid the clamping for most iterations
     910             :                     const int iSrcX = static_cast<int>(
     911  1005070000 :                         std::min(std::max(0.0, dfSrcX),
     912   502537000 :                                  static_cast<double>(nRasterXSize - 1)));
     913             : 
     914             :                     /* --------------------------------------------------------------------
     915             :                      */
     916             :                     /*      Ensure we have the appropriate block loaded. */
     917             :                     /* --------------------------------------------------------------------
     918             :                      */
     919   502537000 :                     if (iSrcX >= nBlockXSize + nStartBlockX)
     920             :                     {
     921     1697820 :                         const int nLBlockX = iSrcX / nBlockXSize;
     922     1697820 :                         nStartBlockX = nLBlockX * nBlockXSize;
     923             : 
     924     1697820 :                         if (poBlock != nullptr)
     925     1574650 :                             poBlock->DropLock();
     926             : 
     927     1697820 :                         poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     928     1697820 :                         if (poBlock == nullptr)
     929             :                         {
     930           9 :                             eErr = CE_Failure;
     931           9 :                             break;
     932             :                         }
     933             : 
     934             :                         pabySrcBlock =
     935     1697810 :                             static_cast<GByte *>(poBlock->GetDataRef());
     936             :                     }
     937   502537000 :                     const GPtrDiff_t nDiffX =
     938   502537000 :                         static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
     939             : 
     940             :                     /* --------------------------------------------------------------------
     941             :                      */
     942             :                     /*      Copy over this pixel of data. */
     943             :                     /* --------------------------------------------------------------------
     944             :                      */
     945             : 
     946   502537000 :                     if (bByteCopy)
     947             :                     {
     948   442592000 :                         GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
     949   442592000 :                         static_cast<GByte *>(pData)[iBufOffset] =
     950   442592000 :                             pabySrcBlock[iSrcOffset];
     951             :                     }
     952    59944700 :                     else if (eDataType == eBufType)
     953             :                     {
     954    50322800 :                         GPtrDiff_t iSrcOffset =
     955    50322800 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     956    50322800 :                         memcpy(static_cast<GByte *>(pData) + iBufOffset,
     957    50322800 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
     958             :                     }
     959             :                     else
     960             :                     {
     961             :                         // Type to type conversion ...
     962     9621890 :                         GPtrDiff_t iSrcOffset =
     963     9621890 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     964     9621890 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
     965             :                                         static_cast<GByte *>(pData) +
     966     9621890 :                                             iBufOffset,
     967             :                                         eBufType, 0, 1);
     968             :                     }
     969             : 
     970   502537000 :                     iBufOffset += static_cast<int>(nPixelSpace);
     971             :                 }
     972             :             }
     973     1970120 :             if (eErr == CE_Failure)
     974          11 :                 break;
     975             : 
     976     2191540 :             if (psExtraArg->pfnProgress != nullptr &&
     977      221434 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     978             :                                          psExtraArg->pProgressData))
     979             :             {
     980           1 :                 eErr = CE_Failure;
     981           1 :                 break;
     982             :             }
     983             :         }
     984             :     }
     985             : 
     986      563656 :     if (poBlock != nullptr)
     987      563646 :         poBlock->DropLock();
     988             : 
     989      563656 :     return eErr;
     990             : }
     991             : 
     992             : /************************************************************************/
     993             : /*                      GDALRasterIOTransformer()                       */
     994             : /************************************************************************/
     995             : 
     996             : struct GDALRasterIOTransformerStruct  // Scale/offset parameters read by GDALRasterIOTransformer() through its pTransformerArg pointer.
     997             : {
     998             :     double dfXOff;  // Added to x after scaling (dst -> src); subtracted before dividing (src -> dst).
     999             :     double dfYOff;  // Same as dfXOff, for the y coordinate.
    1000             :     double dfXRatioDstToSrc;  // x multiplier for dst -> src, divisor for src -> dst. RasterIOResampled() sets it to dfXSize / nBufXSize.
    1001             :     double dfYRatioDstToSrc;  // y multiplier for dst -> src, divisor for src -> dst. RasterIOResampled() sets it to dfYSize / nBufYSize.
    1002             : };
    1003             : 
    1004        6897 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,  // Coordinate transformer installed as psWarpOptions->pfnTransformer by RasterIOResampled().
    1005             :                                    int nPointCount, double *x, double *y,  // x and y each hold nPointCount values and are overwritten in place.
    1006             :                                    double * /* z: unnamed and never read */, int *panSuccess)  // panSuccess[i] is set to TRUE for every point.
    1007             : {  // Always returns TRUE; no failure path exists in this function.
    1008        6897 :     GDALRasterIOTransformerStruct *psParams =
    1009             :         static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);  // pTransformerArg must point to a GDALRasterIOTransformerStruct; it is not null-checked.
    1010        6897 :     if (bDstToSrc)
    1011             :     {
    1012      311993 :         for (int i = 0; i < nPointCount; i++)  // Destination -> source: scale by the ratio, then add the offset.
    1013             :         {
    1014      305684 :             x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
    1015      305684 :             y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
    1016      305684 :             panSuccess[i] = TRUE;
    1017             :         }
    1018             :     }
    1019             :     else
    1020             :     {
    1021        1176 :         for (int i = 0; i < nPointCount; i++)  // Source -> destination: exact inverse, subtract the offset, then divide by the ratio.
    1022             :         {
    1023         588 :             x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;  // NOTE(review): no guard against a zero ratio here; presumably callers never pass one -- confirm.
    1024         588 :             y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
    1025         588 :             panSuccess[i] = TRUE;
    1026             :         }
    1027             :     }
    1028        6897 :     return TRUE;
    1029             : }
    1030             : 
    1031             : /************************************************************************/
    1032             : /*                         RasterIOResampled()                          */
    1033             : /************************************************************************/
    1034             : 
    1035             : //! @cond Doxygen_Suppress
    1036       15797 : CPLErr GDALRasterBand::RasterIOResampled(
    1037             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1038             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1039             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    1040             : {
    1041             :     // Determine if we use warping resampling or overview resampling
    1042             :     const bool bUseWarp =
    1043       15797 :         (GDALDataTypeIsComplex(eDataType) &&
    1044       15956 :          psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    1045         159 :          psExtraArg->eResampleAlg != GRIORA_Mode);
    1046             : 
    1047       15797 :     double dfXOff = nXOff;
    1048       15797 :     double dfYOff = nYOff;
    1049       15797 :     double dfXSize = nXSize;
    1050       15797 :     double dfYSize = nYSize;
    1051       15797 :     if (psExtraArg->bFloatingPointWindowValidity)
    1052             :     {
    1053       15051 :         dfXOff = psExtraArg->dfXOff;
    1054       15051 :         dfYOff = psExtraArg->dfYOff;
    1055       15051 :         dfXSize = psExtraArg->dfXSize;
    1056       15051 :         dfYSize = psExtraArg->dfYSize;
    1057             :     }
    1058             : 
    1059       15797 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1060       15797 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1061             : 
    1062             :     // Determine the coordinates in the "virtual" output raster to see
    1063             :     // if they are integers, in which case we will use them as a shift
    1064             :     // so that subwindow extracts give the exact same results as entire raster
    1065             :     // scaling.
    1066       15797 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1067       15797 :     bool bHasXOffVirtual = false;
    1068       15797 :     int nDestXOffVirtual = 0;
    1069       15797 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1070             :     {
    1071       15469 :         bHasXOffVirtual = true;
    1072       15469 :         dfXOff = nXOff;
    1073       15469 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1074             :     }
    1075             : 
    1076       15797 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1077       15797 :     bool bHasYOffVirtual = false;
    1078       15797 :     int nDestYOffVirtual = 0;
    1079       15797 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1080             :     {
    1081       15465 :         bHasYOffVirtual = true;
    1082       15465 :         dfYOff = nYOff;
    1083       15465 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1084             :     }
    1085             : 
    1086             :     // Create a MEM dataset that wraps the output buffer.
    1087             :     GDALDataset *poMEMDS;
    1088       15797 :     void *pTempBuffer = nullptr;
    1089       15797 :     GSpacing nPSMem = nPixelSpace;
    1090       15797 :     GSpacing nLSMem = nLineSpace;
    1091       15797 :     void *pDataMem = pData;
    1092       15797 :     GDALDataType eDTMem = eBufType;
    1093       15797 :     if (eBufType != eDataType && !GDAL_GET_OPERATE_IN_BUF_TYPE(*psExtraArg))
    1094             :     {
    1095           4 :         nPSMem = GDALGetDataTypeSizeBytes(eDataType);
    1096           4 :         nLSMem = nPSMem * nBufXSize;
    1097             :         pTempBuffer =
    1098           4 :             VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
    1099           4 :         if (pTempBuffer == nullptr)
    1100           0 :             return CE_Failure;
    1101           4 :         pDataMem = pTempBuffer;
    1102           4 :         eDTMem = eDataType;
    1103             :     }
    1104             : 
    1105             :     poMEMDS =
    1106       15797 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1107             :                            nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
    1108       15797 :     GByte *pabyData = static_cast<GByte *>(pDataMem) -
    1109       15797 :                       nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
    1110       15797 :     GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
    1111             :         poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
    1112       15797 :     poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
    1113             : 
    1114             :     const char *pszNBITS =
    1115       15797 :         GetMetadataItem(GDALMD_NBITS, GDAL_MDD_IMAGE_STRUCTURE);
    1116       15797 :     const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    1117       15797 :     if (pszNBITS)
    1118           6 :         GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
    1119           6 :             GDALMD_NBITS, pszNBITS, GDAL_MDD_IMAGE_STRUCTURE);
    1120             : 
    1121       15797 :     CPLErr eErr = CE_None;
    1122             : 
    1123             :     // Do the resampling.
    1124       15797 :     if (bUseWarp)
    1125             :     {
    1126         149 :         int bHasNoData = FALSE;
    1127         149 :         double dfNoDataValue = GetNoDataValue(&bHasNoData);
    1128             : 
    1129         149 :         VRTDatasetH hVRTDS = nullptr;
    1130         149 :         GDALRasterBandH hVRTBand = nullptr;
    1131         149 :         if (GetDataset() == nullptr)
    1132             :         {
    1133             :             /* Create VRT dataset that wraps the whole dataset */
    1134           0 :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1135           0 :             VRTAddBand(hVRTDS, eDataType, nullptr);
    1136           0 :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1137           0 :             VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
    1138             :                                0, 0, nRasterXSize, nRasterYSize, nullptr,
    1139             :                                VRT_NODATA_UNSET);
    1140             : 
    1141             :             /* Add a mask band if needed */
    1142           0 :             if (GetMaskFlags() != GMF_ALL_VALID)
    1143             :             {
    1144           0 :                 GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
    1145             :                 VRTSourcedRasterBand *poVRTMaskBand =
    1146             :                     reinterpret_cast<VRTSourcedRasterBand *>(
    1147             :                         reinterpret_cast<GDALRasterBand *>(hVRTBand)
    1148           0 :                             ->GetMaskBand());
    1149           0 :                 poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
    1150           0 :                                                  nRasterYSize, 0, 0,
    1151           0 :                                                  nRasterXSize, nRasterYSize);
    1152             :             }
    1153             :         }
    1154             : 
    1155         149 :         GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
    1156         149 :         switch (psExtraArg->eResampleAlg)
    1157             :         {
    1158           0 :             case GRIORA_NearestNeighbour:
    1159           0 :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1160           0 :                 break;
    1161         147 :             case GRIORA_Bilinear:
    1162         147 :                 psWarpOptions->eResampleAlg = GRA_Bilinear;
    1163         147 :                 break;
    1164           0 :             case GRIORA_Cubic:
    1165           0 :                 psWarpOptions->eResampleAlg = GRA_Cubic;
    1166           0 :                 break;
    1167           0 :             case GRIORA_CubicSpline:
    1168           0 :                 psWarpOptions->eResampleAlg = GRA_CubicSpline;
    1169           0 :                 break;
    1170           0 :             case GRIORA_Lanczos:
    1171           0 :                 psWarpOptions->eResampleAlg = GRA_Lanczos;
    1172           0 :                 break;
    1173           0 :             case GRIORA_Average:
    1174           0 :                 psWarpOptions->eResampleAlg = GRA_Average;
    1175           0 :                 break;
    1176           2 :             case GRIORA_RMS:
    1177           2 :                 psWarpOptions->eResampleAlg = GRA_RMS;
    1178           2 :                 break;
    1179           0 :             case GRIORA_Mode:
    1180           0 :                 psWarpOptions->eResampleAlg = GRA_Mode;
    1181           0 :                 break;
    1182           0 :             default:
    1183           0 :                 CPLAssert(false);
    1184             :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1185             :                 break;
    1186             :         }
    1187         149 :         psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
    1188         149 :         psWarpOptions->hDstDS = poMEMDS;
    1189         149 :         psWarpOptions->nBandCount = 1;
    1190         149 :         int nSrcBandNumber = hVRTDS ? 1 : nBand;
    1191         149 :         int nDstBandNumber = 1;
    1192         149 :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1193         149 :         psWarpOptions->panDstBands = &nDstBandNumber;
    1194         298 :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress
    1195         149 :                                          ? psExtraArg->pfnProgress
    1196             :                                          : GDALDummyProgress;
    1197         149 :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1198         149 :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1199         149 :         if (bHasNoData)
    1200             :         {
    1201           0 :             psWarpOptions->papszWarpOptions = CSLSetNameValue(
    1202             :                 psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
    1203           0 :             if (psWarpOptions->padfSrcNoDataReal == nullptr)
    1204             :             {
    1205           0 :                 psWarpOptions->padfSrcNoDataReal =
    1206           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1207           0 :                 psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
    1208             :             }
    1209             : 
    1210           0 :             if (psWarpOptions->padfDstNoDataReal == nullptr)
    1211             :             {
    1212           0 :                 psWarpOptions->padfDstNoDataReal =
    1213           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1214           0 :                 psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
    1215             :             }
    1216             :         }
    1217             : 
    1218             :         GDALRasterIOTransformerStruct sTransformer;
    1219         149 :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1220         149 :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1221         149 :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1222         149 :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1223         149 :         psWarpOptions->pTransformerArg = &sTransformer;
    1224             : 
    1225             :         GDALWarpOperationH hWarpOperation =
    1226         149 :             GDALCreateWarpOperation(psWarpOptions);
    1227         149 :         eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
    1228             :                                      nDestYOffVirtual, nBufXSize, nBufYSize);
    1229         149 :         GDALDestroyWarpOperation(hWarpOperation);
    1230             : 
    1231         149 :         psWarpOptions->panSrcBands = nullptr;
    1232         149 :         psWarpOptions->panDstBands = nullptr;
    1233         149 :         GDALDestroyWarpOptions(psWarpOptions);
    1234             : 
    1235         149 :         if (hVRTDS)
    1236           0 :             GDALClose(hVRTDS);
    1237             :     }
    1238             :     else
    1239             :     {
    1240             :         const char *pszResampling =
    1241       15648 :             GDALRasterIOGetResampleAlg(psExtraArg->eResampleAlg);
    1242       15648 :         int nKernelRadius = 0;
    1243             :         GDALResampleFunction pfnResampleFunc =
    1244       15648 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1245       15648 :         CPLAssert(pfnResampleFunc);
    1246             :         GDALDataType eWrkDataType =
    1247       15648 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1248       15648 :         int nHasNoData = 0;
    1249       15648 :         double dfNoDataValue = GetNoDataValue(&nHasNoData);
    1250       15648 :         const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    1251       15648 :         if (!bHasNoData)
    1252       15516 :             dfNoDataValue = 0.0;
    1253             : 
    1254       15648 :         int nDstBlockXSize = nBufXSize;
    1255       15648 :         int nDstBlockYSize = nBufYSize;
    1256       15648 :         int nFullResXChunk = 0;
    1257       15648 :         int nFullResYChunk = 0;
    1258             :         while (true)
    1259             :         {
    1260       15659 :             nFullResXChunk = static_cast<int>(std::min<double>(
    1261       15659 :                 3 + nDstBlockXSize * dfXRatioDstToSrc, nRasterXSize));
    1262       15659 :             nFullResYChunk = static_cast<int>(std::min<double>(
    1263       15659 :                 3 + nDstBlockYSize * dfYRatioDstToSrc, nRasterYSize));
    1264       15659 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1265       15601 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1266             :                  1024 * 1024))
    1267             :                 break;
    1268             :             // When operating on the full width of a raster whose block width is
    1269             :             // the raster width, prefer doing chunks in height.
    1270          11 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1271             :                 nDstBlockYSize > 1)
    1272           0 :                 nDstBlockYSize /= 2;
    1273             :             /* Otherwise cut the maximal dimension */
    1274          11 :             else if (nDstBlockXSize > 1 &&
    1275           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1276          11 :                 nDstBlockXSize /= 2;
    1277             :             else
    1278           0 :                 nDstBlockYSize /= 2;
    1279             :         }
    1280             : 
    1281             :         const int nOvrXFactor =
    1282       15648 :             std::max(1, static_cast<int>(0.5 + dfXRatioDstToSrc));
    1283             :         const int nOvrYFactor =
    1284       15648 :             std::max(1, static_cast<int>(0.5 + dfYRatioDstToSrc));
    1285             :         const int nFullResXSizeQueried = static_cast<int>(
    1286       31296 :             std::min<int64_t>(nFullResXChunk + static_cast<int64_t>(2) *
    1287       15648 :                                                    nKernelRadius * nOvrXFactor,
    1288       15648 :                               nRasterXSize));
    1289             :         const int nFullResYSizeQueried = static_cast<int>(
    1290       31296 :             std::min<int64_t>(nFullResYChunk + static_cast<int64_t>(2) *
    1291       15648 :                                                    nKernelRadius * nOvrYFactor,
    1292       15648 :                               nRasterYSize));
    1293             : 
    1294             :         void *pChunk =
    1295       15648 :             VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    1296             :                                 nFullResXSizeQueried, nFullResYSizeQueried);
    1297       15648 :         GByte *pabyChunkNoDataMask = nullptr;
    1298             : 
    1299       15648 :         GDALRasterBand *poMaskBand = GetMaskBand();
    1300       15648 :         int l_nMaskFlags = GetMaskFlags();
    1301             : 
    1302       15648 :         bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
    1303       15648 :         if (bUseNoDataMask)
    1304             :         {
    1305        7525 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1306             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1307             :         }
    1308       15648 :         if (pChunk == nullptr ||
    1309        7525 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1310             :         {
    1311           0 :             GDALClose(poMEMDS);
    1312           0 :             CPLFree(pChunk);
    1313           0 :             CPLFree(pabyChunkNoDataMask);
    1314           0 :             VSIFree(pTempBuffer);
    1315           0 :             return CE_Failure;
    1316             :         }
    1317             : 
    1318             :         const int64_t nTotalBlocks =
    1319       15648 :             static_cast<int64_t>(cpl::div_round_up(nBufXSize, nDstBlockXSize)) *
    1320       15648 :             cpl::div_round_up(nBufYSize, nDstBlockYSize);
    1321       15648 :         int64_t nBlocksDone = 0;
    1322             : 
    1323       31296 :         for (int nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1324       15648 :              nDstYOff += nDstBlockYSize)
    1325             :         {
    1326             :             int nDstYCount;
    1327       15648 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1328       15648 :                 nDstYCount = nDstBlockYSize;
    1329             :             else
    1330           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1331             : 
    1332       15648 :             int nChunkYOff =
    1333       15648 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1334       15648 :             int nChunkYOff2 = nYOff + 1 +
    1335       15648 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1336             :                                                     dfYRatioDstToSrc));
    1337       15648 :             if (nChunkYOff2 > nRasterYSize)
    1338         789 :                 nChunkYOff2 = nRasterYSize;
    1339       15648 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1340       15648 :             CPLAssert(nYCount <= nFullResYChunk);
    1341             : 
    1342       15648 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
    1343       15648 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
    1344       15648 :             if (nChunkYOffQueried < 0)
    1345             :             {
    1346         498 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1347         498 :                 nChunkYOffQueried = 0;
    1348             :             }
    1349       15648 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1350         607 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1351       15648 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1352             : 
    1353       15648 :             int nDstXOff = 0;
    1354       31296 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1355       15648 :                  nDstXOff += nDstBlockXSize)
    1356             :             {
    1357       15648 :                 int nDstXCount = 0;
    1358       15648 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1359       15648 :                     nDstXCount = nDstBlockXSize;
    1360             :                 else
    1361           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1362             : 
    1363       15648 :                 int nChunkXOff =
    1364       15648 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1365       15648 :                 int nChunkXOff2 =
    1366       15648 :                     nXOff + 1 +
    1367       15648 :                     static_cast<int>(
    1368       15648 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1369       15648 :                 if (nChunkXOff2 > nRasterXSize)
    1370        9827 :                     nChunkXOff2 = nRasterXSize;
    1371       15648 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1372       15648 :                 CPLAssert(nXCount <= nFullResXChunk);
    1373             : 
    1374       15648 :                 int nChunkXOffQueried =
    1375       15648 :                     nChunkXOff - nKernelRadius * nOvrXFactor;
    1376       15648 :                 int nChunkXSizeQueried =
    1377       15648 :                     nXCount + 2 * nKernelRadius * nOvrXFactor;
    1378       15648 :                 if (nChunkXOffQueried < 0)
    1379             :                 {
    1380        3310 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1381        3310 :                     nChunkXOffQueried = 0;
    1382             :                 }
    1383       15648 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1384        3806 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1385       15648 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1386             : 
    1387             :                 // Read the source buffers.
    1388       15648 :                 eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1389             :                                 nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1390             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    1391             :                                 eWrkDataType, 0, 0, nullptr);
    1392             : 
    1393       15648 :                 bool bSkipResample = false;
    1394       15648 :                 bool bNoDataMaskFullyOpaque = false;
    1395       15648 :                 if (eErr == CE_None && bUseNoDataMask)
    1396             :                 {
    1397        7525 :                     eErr = poMaskBand->RasterIO(
    1398             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1399             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1400             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1401             :                         nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
    1402             : 
    1403             :                     /* Optimizations if mask is fully opaque or transparent */
    1404        7525 :                     int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1405        7525 :                     GByte bVal = pabyChunkNoDataMask[0];
    1406        7525 :                     int i = 1;
    1407    15237000 :                     for (; i < nPixels; i++)
    1408             :                     {
    1409    15230700 :                         if (pabyChunkNoDataMask[i] != bVal)
    1410        1168 :                             break;
    1411             :                     }
    1412        7525 :                     if (i == nPixels)
    1413             :                     {
    1414        6357 :                         if (bVal == 0)
    1415             :                         {
    1416       12094 :                             for (int j = 0; j < nDstYCount; j++)
    1417             :                             {
    1418        6377 :                                 GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
    1419             :                                                 static_cast<GByte *>(pDataMem) +
    1420        6377 :                                                     nLSMem * (j + nDstYOff) +
    1421        6377 :                                                     nDstXOff * nPSMem,
    1422             :                                                 eDTMem,
    1423             :                                                 static_cast<int>(nPSMem),
    1424             :                                                 nDstXCount);
    1425             :                             }
    1426        5717 :                             bSkipResample = true;
    1427             :                         }
    1428             :                         else
    1429             :                         {
    1430         640 :                             bNoDataMaskFullyOpaque = true;
    1431             :                         }
    1432             :                     }
    1433             :                 }
    1434             : 
    1435       15648 :                 if (!bSkipResample && eErr == CE_None)
    1436             :                 {
    1437        9928 :                     const bool bPropagateNoData = false;
    1438        9928 :                     void *pDstBuffer = nullptr;
    1439        9928 :                     GDALDataType eDstBufferDataType = GDT_Unknown;
    1440             :                     GDALRasterBand *poMEMBand =
    1441        9928 :                         GDALRasterBand::FromHandle(hMEMBand);
    1442        9928 :                     GDALOverviewResampleArgs args;
    1443        9928 :                     args.eSrcDataType = eDataType;
    1444        9928 :                     args.eOvrDataType = poMEMBand->GetRasterDataType();
    1445        9928 :                     args.nOvrXSize = poMEMBand->GetXSize();
    1446        9928 :                     args.nOvrYSize = poMEMBand->GetYSize();
    1447        9928 :                     args.nOvrNBITS = nNBITS;
    1448        9928 :                     args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1449        9928 :                     args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1450        9928 :                     args.dfSrcXDelta =
    1451        9928 :                         dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1452        9928 :                     args.dfSrcYDelta =
    1453        9928 :                         dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1454        9928 :                     args.eWrkDataType = eWrkDataType;
    1455        9928 :                     args.pabyChunkNodataMask =
    1456        9928 :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
    1457        9928 :                     args.nChunkXOff =
    1458        9928 :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1459        9928 :                     args.nChunkXSize = nChunkXSizeQueried;
    1460        9928 :                     args.nChunkYOff =
    1461        9928 :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1462        9928 :                     args.nChunkYSize = nChunkYSizeQueried;
    1463        9928 :                     args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1464        9928 :                     args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
    1465        9928 :                     args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1466        9928 :                     args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
    1467        9928 :                     args.pszResampling = pszResampling;
    1468        9928 :                     args.bHasNoData = bHasNoData;
    1469        9928 :                     args.dfNoDataValue = dfNoDataValue;
    1470        9928 :                     args.poColorTable = GetColorTable();
    1471        9928 :                     args.bPropagateNoData = bPropagateNoData;
    1472        9928 :                     eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
    1473             :                                            &eDstBufferDataType);
    1474        9928 :                     if (eErr == CE_None)
    1475             :                     {
    1476        9928 :                         eErr = poMEMBand->RasterIO(
    1477             :                             GF_Write, nDstXOff + nDestXOffVirtual,
    1478             :                             nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
    1479             :                             pDstBuffer, nDstXCount, nDstYCount,
    1480             :                             eDstBufferDataType, 0, 0, nullptr);
    1481             :                     }
    1482        9928 :                     CPLFree(pDstBuffer);
    1483             :                 }
    1484             : 
    1485       15648 :                 nBlocksDone++;
    1486       28106 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1487       12458 :                     !psExtraArg->pfnProgress(
    1488       12458 :                         static_cast<double>(nBlocksDone) /
    1489       12458 :                             static_cast<double>(nTotalBlocks),
    1490             :                         "", psExtraArg->pProgressData))
    1491             :                 {
    1492           1 :                     eErr = CE_Failure;
    1493             :                 }
    1494             :             }
    1495             :         }
    1496             : 
    1497       15648 :         CPLFree(pChunk);
    1498       15648 :         CPLFree(pabyChunkNoDataMask);
    1499             :     }
    1500             : 
    1501       15797 :     if (pTempBuffer)
    1502             :     {
    1503           4 :         CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
    1504             :             GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
    1505             :             pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    1506             :             nullptr));
    1507             :     }
    1508       15797 :     GDALClose(poMEMDS);
    1509       15797 :     VSIFree(pTempBuffer);
    1510             : 
    1511       15797 :     return eErr;
    1512             : }
    1513             : 
    1514             : /************************************************************************/
    1515             : /*                         RasterIOResampled()                          */
    1516             : /************************************************************************/
    1517             : 
    1518        2431 : CPLErr GDALDataset::RasterIOResampled(
    1519             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1520             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1521             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    1522             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    1523             : 
    1524             : {
    1525             : #if 0
    1526             :     // Determine if we use warping resampling or overview resampling
    1527             :     bool bUseWarp = false;
    1528             :     if( GDALDataTypeIsComplex( eDataType ) )
    1529             :         bUseWarp = true;
    1530             : #endif
    1531             : 
    1532        2431 :     double dfXOff = nXOff;
    1533        2431 :     double dfYOff = nYOff;
    1534        2431 :     double dfXSize = nXSize;
    1535        2431 :     double dfYSize = nYSize;
    1536        2431 :     if (psExtraArg->bFloatingPointWindowValidity)
    1537             :     {
    1538        2304 :         dfXOff = psExtraArg->dfXOff;
    1539        2304 :         dfYOff = psExtraArg->dfYOff;
    1540        2304 :         dfXSize = psExtraArg->dfXSize;
    1541        2304 :         dfYSize = psExtraArg->dfYSize;
    1542             :     }
    1543             : 
    1544        2431 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1545        2431 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1546             : 
    1547             :     // Determine the coordinates in the "virtual" output raster to see
    1548             :     // if they are integers, in which case we will use them as a shift
    1549             :     // so that subwindow extracts give the exact same results as entire raster
    1550             :     // scaling.
    1551        2431 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1552        2431 :     bool bHasXOffVirtual = false;
    1553        2431 :     int nDestXOffVirtual = 0;
    1554        2431 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1555             :     {
    1556        2306 :         bHasXOffVirtual = true;
    1557        2306 :         dfXOff = nXOff;
    1558        2306 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1559             :     }
    1560             : 
    1561        2431 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1562        2431 :     bool bHasYOffVirtual = false;
    1563        2431 :     int nDestYOffVirtual = 0;
    1564        2431 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1565             :     {
    1566        2266 :         bHasYOffVirtual = true;
    1567        2266 :         dfYOff = nYOff;
    1568        2266 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1569             :     }
    1570             : 
    1571             :     // Create a MEM dataset that wraps the output buffer.
    1572        2431 :     std::unique_ptr<void, VSIFreeReleaser> pTempBuffer;
    1573        2431 :     GSpacing nPSMem = nPixelSpace;
    1574        2431 :     GSpacing nLSMem = nLineSpace;
    1575        2431 :     GSpacing nBandSpaceMEM = nBandSpace;
    1576        2431 :     void *pDataMem = pData;
    1577        2431 :     GDALDataType eDTMem = eBufType;
    1578        2431 :     GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
    1579        2431 :     const GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
    1580        2431 :     if (eBufType != eDataType && !GDAL_GET_OPERATE_IN_BUF_TYPE(*psExtraArg))
    1581             :     {
    1582           2 :         nPSMem = GDALGetDataTypeSizeBytes(eDataType);
    1583           2 :         nLSMem = nPSMem * nBufXSize;
    1584           2 :         nBandSpaceMEM = nLSMem * nBandCount;
    1585           2 :         pTempBuffer.reset(VSI_MALLOC3_VERBOSE(nBandCount, nBufYSize,
    1586             :                                               static_cast<size_t>(nLSMem)));
    1587           2 :         if (pTempBuffer == nullptr)
    1588           0 :             return CE_Failure;
    1589           2 :         pDataMem = pTempBuffer.get();
    1590           2 :         eDTMem = eDataType;
    1591             :     }
    1592             : 
    1593             :     auto poMEMDS = std::unique_ptr<GDALDataset>(
    1594        2431 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1595        4862 :                            nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr));
    1596             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1597             :     std::vector<GDALRasterBand *> apoDstBands(nBandCount);
    1598             : #endif
    1599        2431 :     int nNBITS = 0;
    1600        9052 :     for (int i = 0; i < nBandCount; i++)
    1601             :     {
    1602        6621 :         GByte *const pBandData = static_cast<GByte *>(pDataMem) -
    1603        6621 :                                  nPSMem * nDestXOffVirtual -
    1604        6621 :                                  nLSMem * nDestYOffVirtual + nBandSpaceMEM * i;
    1605        6621 :         auto poMEMBand = GDALRasterBand::FromHandle(MEMCreateRasterBandEx(
    1606             :             poMEMDS.get(), i + 1, pBandData, eDTMem, nPSMem, nLSMem, false));
    1607        6621 :         poMEMDS->SetBand(i + 1, poMEMBand);
    1608             : 
    1609        6621 :         GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
    1610             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1611             :         apoDstBands[i] = poMEMBand;
    1612             : #endif
    1613             :         const char *pszNBITS =
    1614        6621 :             poSrcBand->GetMetadataItem(GDALMD_NBITS, GDAL_MDD_IMAGE_STRUCTURE);
    1615        6621 :         if (pszNBITS)
    1616             :         {
    1617           0 :             nNBITS = atoi(pszNBITS);
    1618           0 :             poMEMDS->GetRasterBand(i + 1)->SetMetadataItem(
    1619           0 :                 GDALMD_NBITS, pszNBITS, GDAL_MDD_IMAGE_STRUCTURE);
    1620             :         }
    1621             :     }
    1622             : 
    1623        2431 :     CPLErr eErr = CE_None;
    1624             : 
    1625             :     // TODO(schwehr): Why disabled?  Why not just delete?
    1626             :     // Looks like this code was initially added as disabled by copying
    1627             :     // from RasterIO here:
    1628             :     // https://trac.osgeo.org/gdal/changeset/29572
    1629             : #if 0
    1630             :     // Do the resampling.
    1631             :     if( bUseWarp )
    1632             :     {
    1633             :         VRTDatasetH hVRTDS = nullptr;
    1634             :         GDALRasterBandH hVRTBand = nullptr;
    1635             :         if( GetDataset() == nullptr )
    1636             :         {
    1637             :             /* Create VRT dataset that wraps the whole dataset */
    1638             :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1639             :             VRTAddBand( hVRTDS, eDataType, nullptr );
    1640             :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1641             :             VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
    1642             :                                 (GDALRasterBandH)this,
    1643             :                                 0, 0,
    1644             :                                 nRasterXSize, nRasterYSize,
    1645             :                                 0, 0,
    1646             :                                 nRasterXSize, nRasterYSize,
    1647             :                                 nullptr, VRT_NODATA_UNSET );
    1648             : 
    1649             :             /* Add a mask band if needed */
    1650             :             if( GetMaskFlags() != GMF_ALL_VALID )
    1651             :             {
    1652             :                 ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
    1653             :                 VRTSourcedRasterBand* poVRTMaskBand =
    1654             :                     (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
    1655             :                 poVRTMaskBand->
    1656             :                     AddMaskBandSource( this,
    1657             :                                     0, 0,
    1658             :                                     nRasterXSize, nRasterYSize,
    1659             :                                     0, 0,
    1660             :                                     nRasterXSize, nRasterYSize);
    1661             :             }
    1662             :         }
    1663             : 
    1664             :         GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
    1665             :         psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
    1666             :         psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
    1667             :         psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
    1668             :         psWarpOptions->nBandCount = 1;
    1669             :         int nSrcBandNumber = (hVRTDS ? 1 : nBand);
    1670             :         int nDstBandNumber = 1;
    1671             :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1672             :         psWarpOptions->panDstBands = &nDstBandNumber;
    1673             :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
    1674             :                     psExtraArg->pfnProgress : GDALDummyProgress;
    1675             :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1676             :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1677             :         GDALRasterIOTransformerStruct sTransformer;
    1678             :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1679             :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1680             :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1681             :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1682             :         psWarpOptions->pTransformerArg = &sTransformer;
    1683             : 
    1684             :         GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
    1685             :         eErr = GDALChunkAndWarpImage( hWarpOperation,
    1686             :                                       nDestXOffVirtual, nDestYOffVirtual,
    1687             :                                       nBufXSize, nBufYSize );
    1688             :         GDALDestroyWarpOperation( hWarpOperation );
    1689             : 
    1690             :         psWarpOptions->panSrcBands = nullptr;
    1691             :         psWarpOptions->panDstBands = nullptr;
    1692             :         GDALDestroyWarpOptions( psWarpOptions );
    1693             : 
    1694             :         if( hVRTDS )
    1695             :             GDALClose(hVRTDS);
    1696             :     }
    1697             :     else
    1698             : #endif
    1699             :     {
    1700             :         const char *pszResampling =
    1701        2431 :             GDALRasterIOGetResampleAlg(psExtraArg->eResampleAlg);
    1702             : 
    1703             :         int nBlockXSize, nBlockYSize;
    1704        2431 :         poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    1705             : 
    1706             :         int nKernelRadius;
    1707             :         GDALResampleFunction pfnResampleFunc =
    1708        2431 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1709        2431 :         CPLAssert(pfnResampleFunc);
    1710             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1711             :         GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
    1712             :             GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
    1713             : #endif
    1714             :         GDALDataType eWrkDataType =
    1715        2431 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1716             : 
    1717        2431 :         int nDstBlockXSize = nBufXSize;
    1718        2431 :         int nDstBlockYSize = nBufYSize;
    1719             :         int nFullResXChunk, nFullResYChunk;
    1720             :         while (true)
    1721             :         {
    1722        2431 :             nFullResXChunk = static_cast<int>(std::min<double>(
    1723        2431 :                 3 + nDstBlockXSize * dfXRatioDstToSrc, nRasterXSize));
    1724        2431 :             nFullResYChunk = static_cast<int>(std::min<double>(
    1725        2431 :                 3 + nDstBlockYSize * dfYRatioDstToSrc, nRasterYSize));
    1726        2431 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1727        2429 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1728             :                  1024 * 1024))
    1729             :                 break;
    1730             :             // When operating on the full width of a raster whose block width is
    1731             :             // the raster width, prefer doing chunks in height.
    1732           0 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1733             :                 nDstBlockYSize > 1)
    1734           0 :                 nDstBlockYSize /= 2;
    1735             :             /* Otherwise cut the maximal dimension */
    1736           0 :             else if (nDstBlockXSize > 1 &&
    1737           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1738           0 :                 nDstBlockXSize /= 2;
    1739             :             else
    1740           0 :                 nDstBlockYSize /= 2;
    1741             :         }
    1742             : 
    1743             :         const int nOvrFactor =
    1744        7293 :             std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    1745        2431 :                                  static_cast<int>(0.5 + dfYRatioDstToSrc)));
    1746             :         const int nFullResXSizeQueried = static_cast<int>(
    1747        4862 :             std::min<int64_t>(nFullResXChunk + static_cast<int64_t>(2) *
    1748        2431 :                                                    nKernelRadius * nOvrFactor,
    1749        2431 :                               nRasterXSize));
    1750             :         const int nFullResYSizeQueried = static_cast<int>(
    1751        4862 :             std::min<int64_t>(nFullResYChunk + static_cast<int64_t>(2) *
    1752        2431 :                                                    nKernelRadius * nOvrFactor,
    1753        2431 :                               nRasterYSize));
    1754             : 
    1755        2431 :         void *pChunk = VSI_MALLOC3_VERBOSE(
    1756             :             cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
    1757             :                               nBandCount),
    1758             :             nFullResXSizeQueried, nFullResYSizeQueried);
    1759        2431 :         GByte *pabyChunkNoDataMask = nullptr;
    1760             : 
    1761        2431 :         GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
    1762        2431 :         int nMaskFlags = poFirstSrcBand->GetMaskFlags();
    1763             : 
    1764        2431 :         bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
    1765        2431 :         if (bUseNoDataMask)
    1766             :         {
    1767        2156 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1768             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1769             :         }
    1770        2431 :         if (pChunk == nullptr ||
    1771        2156 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1772             :         {
    1773           0 :             CPLFree(pChunk);
    1774           0 :             CPLFree(pabyChunkNoDataMask);
    1775           0 :             return CE_Failure;
    1776             :         }
    1777             : 
    1778             :         const int64_t nTotalBlocks =
    1779        2431 :             static_cast<int64_t>(cpl::div_round_up(nBufXSize, nDstBlockXSize)) *
    1780        2431 :             cpl::div_round_up(nBufYSize, nDstBlockYSize);
    1781        2431 :         int64_t nBlocksDone = 0;
    1782             : 
    1783        4862 :         for (int nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1784        2431 :              nDstYOff += nDstBlockYSize)
    1785             :         {
    1786             :             int nDstYCount;
    1787        2431 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1788        2431 :                 nDstYCount = nDstBlockYSize;
    1789             :             else
    1790           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1791             : 
    1792        2431 :             int nChunkYOff =
    1793        2431 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1794        2431 :             int nChunkYOff2 = nYOff + 1 +
    1795        2431 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1796             :                                                     dfYRatioDstToSrc));
    1797        2431 :             if (nChunkYOff2 > nRasterYSize)
    1798         146 :                 nChunkYOff2 = nRasterYSize;
    1799        2431 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1800        2431 :             CPLAssert(nYCount <= nFullResYChunk);
    1801             : 
    1802        2431 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    1803        2431 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
    1804        2431 :             if (nChunkYOffQueried < 0)
    1805             :             {
    1806         149 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1807         149 :                 nChunkYOffQueried = 0;
    1808             :             }
    1809        2431 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1810         170 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1811        2431 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1812             : 
    1813             :             int nDstXOff;
    1814        4862 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1815        2431 :                  nDstXOff += nDstBlockXSize)
    1816             :             {
    1817             :                 int nDstXCount;
    1818        2431 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1819        2431 :                     nDstXCount = nDstBlockXSize;
    1820             :                 else
    1821           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1822             : 
    1823        2431 :                 int nChunkXOff =
    1824        2431 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1825        2431 :                 int nChunkXOff2 =
    1826        2431 :                     nXOff + 1 +
    1827        2431 :                     static_cast<int>(
    1828        2431 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1829        2431 :                 if (nChunkXOff2 > nRasterXSize)
    1830        1672 :                     nChunkXOff2 = nRasterXSize;
    1831        2431 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1832        2431 :                 CPLAssert(nXCount <= nFullResXChunk);
    1833             : 
    1834        2431 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    1835        2431 :                 int nChunkXSizeQueried =
    1836        2431 :                     nXCount + 2 * nKernelRadius * nOvrFactor;
    1837        2431 :                 if (nChunkXOffQueried < 0)
    1838             :                 {
    1839        1162 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1840        1162 :                     nChunkXOffQueried = 0;
    1841             :                 }
    1842        2431 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1843        1680 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1844        2431 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1845             : 
    1846        2431 :                 bool bSkipResample = false;
    1847        2431 :                 bool bNoDataMaskFullyOpaque = false;
    1848        2431 :                 if (eErr == CE_None && bUseNoDataMask)
    1849             :                 {
    1850        2156 :                     eErr = poMaskBand->RasterIO(
    1851             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1852             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1853             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1854             :                         nChunkYSizeQueried, GDT_UInt8, 0, 0, nullptr);
    1855             : 
    1856             :                     /* Optimizations if mask is fully opaque or transparent */
    1857        2156 :                     const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1858        2156 :                     const GByte bVal = pabyChunkNoDataMask[0];
    1859        2156 :                     int i = 1;  // Used after for.
    1860    49799600 :                     for (; i < nPixels; i++)
    1861             :                     {
    1862    49798500 :                         if (pabyChunkNoDataMask[i] != bVal)
    1863        1031 :                             break;
    1864             :                     }
    1865        2156 :                     if (i == nPixels)
    1866             :                     {
    1867        1125 :                         if (bVal == 0)
    1868             :                         {
    1869         953 :                             GByte abyZero[16] = {0};
    1870        3100 :                             for (int iBand = 0; iBand < nBandCount; iBand++)
    1871             :                             {
    1872        6979 :                                 for (int j = 0; j < nDstYCount; j++)
    1873             :                                 {
    1874        4832 :                                     GDALCopyWords64(
    1875             :                                         abyZero, GDT_UInt8, 0,
    1876             :                                         static_cast<GByte *>(pDataMem) +
    1877        4832 :                                             iBand * nBandSpaceMEM +
    1878        4832 :                                             nLSMem * (j + nDstYOff) +
    1879        4832 :                                             nDstXOff * nPSMem,
    1880             :                                         eBufType, static_cast<int>(nPSMem),
    1881             :                                         nDstXCount);
    1882             :                                 }
    1883             :                             }
    1884         953 :                             bSkipResample = true;
    1885             :                         }
    1886             :                         else
    1887             :                         {
    1888         172 :                             bNoDataMaskFullyOpaque = true;
    1889             :                         }
    1890             :                     }
    1891             :                 }
    1892             : 
    1893        2431 :                 if (!bSkipResample && eErr == CE_None)
    1894             :                 {
    1895             :                     /* Read the source buffers */
    1896        1475 :                     eErr = RasterIO(
    1897             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1898             :                         nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1899             :                         nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
    1900             :                         nBandCount, panBandMap, 0, 0, 0, nullptr);
    1901             :                 }
    1902             : 
    1903             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1904             :                 if (pfnResampleFuncMultiBands && !bSkipResample &&
    1905             :                     eErr == CE_None)
    1906             :                 {
    1907             :                     eErr = pfnResampleFuncMultiBands(
    1908             :                         dfXRatioDstToSrc, dfYRatioDstToSrc,
    1909             :                         dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
    1910             :                         dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
    1911             :                         eWrkDataType, (GByte *)pChunk, nBandCount,
    1912             :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
    1913             :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
    1914             :                         nChunkXSizeQueried,
    1915             :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
    1916             :                         nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
    1917             :                         nDstXOff + nDestXOffVirtual + nDstXCount,
    1918             :                         nDstYOff + nDestYOffVirtual,
    1919             :                         nDstYOff + nDestYOffVirtual + nDstYCount,
    1920             :                         apoDstBands.data(), pszResampling, FALSE /*bHasNoData*/,
    1921             :                         0.0 /* dfNoDataValue */, nullptr /* color table*/,
    1922             :                         eDataType);
    1923             :                 }
    1924             :                 else
    1925             : #endif
    1926             :                 {
    1927             :                     size_t nChunkBandOffset =
    1928        2431 :                         static_cast<size_t>(nChunkXSizeQueried) *
    1929        2431 :                         nChunkYSizeQueried *
    1930        2431 :                         GDALGetDataTypeSizeBytes(eWrkDataType);
    1931        6896 :                     for (int i = 0;
    1932        6896 :                          i < nBandCount && !bSkipResample && eErr == CE_None;
    1933             :                          i++)
    1934             :                     {
    1935        4465 :                         const bool bPropagateNoData = false;
    1936        4465 :                         void *pDstBuffer = nullptr;
    1937        4465 :                         GDALDataType eDstBufferDataType = GDT_Unknown;
    1938             :                         GDALRasterBand *poMEMBand =
    1939        4465 :                             poMEMDS->GetRasterBand(i + 1);
    1940        4465 :                         GDALOverviewResampleArgs args;
    1941        4465 :                         args.eSrcDataType = eDataType;
    1942        4465 :                         args.eOvrDataType = poMEMBand->GetRasterDataType();
    1943        4465 :                         args.nOvrXSize = poMEMBand->GetXSize();
    1944        4465 :                         args.nOvrYSize = poMEMBand->GetYSize();
    1945        4465 :                         args.nOvrNBITS = nNBITS;
    1946        4465 :                         args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1947        4465 :                         args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1948        4465 :                         args.dfSrcXDelta =
    1949        4465 :                             dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1950        4465 :                         args.dfSrcYDelta =
    1951        4465 :                             dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1952        4465 :                         args.eWrkDataType = eWrkDataType;
    1953        4465 :                         args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
    1954        4465 :                                                        ? nullptr
    1955             :                                                        : pabyChunkNoDataMask;
    1956        4465 :                         args.nChunkXOff =
    1957        4465 :                             nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1958        4465 :                         args.nChunkXSize = nChunkXSizeQueried;
    1959        4465 :                         args.nChunkYOff =
    1960        4465 :                             nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1961        4465 :                         args.nChunkYSize = nChunkYSizeQueried;
    1962        4465 :                         args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1963        4465 :                         args.nDstXOff2 =
    1964        4465 :                             nDstXOff + nDestXOffVirtual + nDstXCount;
    1965        4465 :                         args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1966        4465 :                         args.nDstYOff2 =
    1967        4465 :                             nDstYOff + nDestYOffVirtual + nDstYCount;
    1968        4465 :                         args.pszResampling = pszResampling;
    1969        4465 :                         args.bHasNoData = false;
    1970        4465 :                         args.dfNoDataValue = 0.0;
    1971        4465 :                         args.poColorTable = nullptr;
    1972        4465 :                         args.bPropagateNoData = bPropagateNoData;
    1973             : 
    1974             :                         eErr =
    1975        8930 :                             pfnResampleFunc(args,
    1976        4465 :                                             reinterpret_cast<GByte *>(pChunk) +
    1977        4465 :                                                 i * nChunkBandOffset,
    1978             :                                             &pDstBuffer, &eDstBufferDataType);
    1979        4465 :                         if (eErr == CE_None)
    1980             :                         {
    1981        4465 :                             eErr = poMEMBand->RasterIO(
    1982             :                                 GF_Write, nDstXOff + nDestXOffVirtual,
    1983             :                                 nDstYOff + nDestYOffVirtual, nDstXCount,
    1984             :                                 nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
    1985             :                                 eDstBufferDataType, 0, 0, nullptr);
    1986             :                         }
    1987        4465 :                         CPLFree(pDstBuffer);
    1988             :                     }
    1989             :                 }
    1990             : 
    1991        2431 :                 nBlocksDone++;
    1992        4356 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1993        1925 :                     !psExtraArg->pfnProgress(
    1994        1925 :                         static_cast<double>(nBlocksDone) /
    1995        1925 :                             static_cast<double>(nTotalBlocks),
    1996             :                         "", psExtraArg->pProgressData))
    1997             :                 {
    1998           0 :                     eErr = CE_Failure;
    1999             :                 }
    2000             :             }
    2001             :         }
    2002             : 
    2003        2431 :         CPLFree(pChunk);
    2004        2431 :         CPLFree(pabyChunkNoDataMask);
    2005             :     }
    2006             : 
    2007        2431 :     if (pTempBuffer)
    2008             :     {
    2009           2 :         CPL_IGNORE_RET_VAL(poMEMDS->RasterIO(
    2010             :             GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
    2011             :             pData, nBufXSize, nBufYSize, eBufType, nBandCount, nullptr,
    2012             :             nPixelSpace, nLineSpace, nBandSpace, nullptr));
    2013             :     }
    2014             : 
    2015        2431 :     return eErr;
    2016             : }
    2017             : 
    2018             : //! @endcond
    2019             : 
    2020             : /************************************************************************/
    2021             : /*                           GDALSwapWords()                            */
    2022             : /************************************************************************/
    2023             : 
    2024             : /**
    2025             :  * Byte swap words in-place.
    2026             :  *
    2027             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2028             :  * a memory array.  No assumption is made that the words being swapped are
    2029             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2030             :  * to determine if the current platform is big endian or little endian.  Use
    2031             :  * The macros like CPL_SWAP32() to byte swap single values without the overhead
    2032             :  * of a function call.
    2033             :  *
    2034             :  * @param pData pointer to start of data buffer.
    2035             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2036             :  * @param nWordCount the number of words to be swapped in this call.
    2037             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2038             :  * the next. For packed buffers this is the same as nWordSize.
    2039             :  */
    2040             : 
    2041      497405 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
    2042             :                                int nWordSkip)
    2043             : 
    2044             : {
    2045      497405 :     if (nWordCount > 0)
    2046      497405 :         VALIDATE_POINTER0(pData, "GDALSwapWords");
    2047             : 
    2048      497405 :     GByte *pabyData = static_cast<GByte *>(pData);
    2049             : 
    2050      497405 :     switch (nWordSize)
    2051             :     {
    2052        7234 :         case 1:
    2053        7234 :             break;
    2054             : 
    2055      477161 :         case 2:
    2056      477161 :             CPLAssert(nWordSkip >= 2 || nWordCount == 1);
    2057   228194000 :             for (int i = 0; i < nWordCount; i++)
    2058             :             {
    2059   227716000 :                 CPL_SWAP16PTR(pabyData);
    2060   227716000 :                 pabyData += nWordSkip;
    2061             :             }
    2062      477161 :             break;
    2063             : 
    2064       10584 :         case 4:
    2065       10584 :             CPLAssert(nWordSkip >= 4 || nWordCount == 1);
    2066       10584 :             if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
    2067             :             {
    2068    29140600 :                 for (int i = 0; i < nWordCount; i++)
    2069             :                 {
    2070    29130000 :                     *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
    2071             :                         *reinterpret_cast<const GUInt32 *>(pabyData));
    2072    29130000 :                     pabyData += nWordSkip;
    2073       10581 :                 }
    2074             :             }
    2075             :             else
    2076             :             {
    2077           9 :                 for (int i = 0; i < nWordCount; i++)
    2078             :                 {
    2079           6 :                     CPL_SWAP32PTR(pabyData);
    2080           6 :                     pabyData += nWordSkip;
    2081             :                 }
    2082             :             }
    2083       10584 :             break;
    2084             : 
    2085        2426 :         case 8:
    2086        2426 :             CPLAssert(nWordSkip >= 8 || nWordCount == 1);
    2087        2426 :             if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
    2088             :             {
    2089     3356900 :                 for (int i = 0; i < nWordCount; i++)
    2090             :                 {
    2091     3354480 :                     *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
    2092             :                         *reinterpret_cast<const GUInt64 *>(pabyData));
    2093     3354480 :                     pabyData += nWordSkip;
    2094        2425 :                 }
    2095             :             }
    2096             :             else
    2097             :             {
    2098           3 :                 for (int i = 0; i < nWordCount; i++)
    2099             :                 {
    2100           2 :                     CPL_SWAP64PTR(pabyData);
    2101           2 :                     pabyData += nWordSkip;
    2102             :                 }
    2103             :             }
    2104        2426 :             break;
    2105             : 
    2106           0 :         default:
    2107           0 :             CPLAssert(false);
    2108             :     }
    2109             : }
    2110             : 
    2111             : /************************************************************************/
    2112             : /*                          GDALSwapWordsEx()                           */
    2113             : /************************************************************************/
    2114             : 
    2115             : /**
    2116             :  * Byte swap words in-place.
    2117             :  *
    2118             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2119             :  * a memory array.  No assumption is made that the words being swapped are
    2120             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2121             :  * to determine if the current platform is big endian or little endian.  Use
    2122             :  * The macros like CPL_SWAP32() to byte swap single values without the overhead
    2123             :  * of a function call.
    2124             :  *
    2125             :  * @param pData pointer to start of data buffer.
    2126             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2127             :  * @param nWordCount the number of words to be swapped in this call.
    2128             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2129             :  * the next. For packed buffers this is the same as nWordSize.
    2130             :  */
    2131        6130 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
    2132             :                                  int nWordSkip)
    2133             : {
    2134        6130 :     GByte *pabyData = static_cast<GByte *>(pData);
    2135       12260 :     while (nWordCount)
    2136             :     {
    2137             :         // Pick-up a multiple of 8 as max chunk size.
    2138        6130 :         const int nWordCountSmall =
    2139        6130 :             (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
    2140        6130 :         GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
    2141        6130 :         pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
    2142        6130 :         nWordCount -= nWordCountSmall;
    2143             :     }
    2144        6130 : }
    2145             : 
    2146             : // Place the new GDALCopyWords helpers in an anonymous namespace
    2147             : namespace
    2148             : {
    2149             : 
    2150             : /************************************************************************/
    2151             : /*                           GDALCopyWordsT()                           */
    2152             : /************************************************************************/
    2153             : /**
    2154             :  * Template function, used to copy data from pSrcData into buffer
    2155             :  * pDstData, with stride nSrcPixelStride in the source data and
    2156             :  * stride nDstPixelStride in the destination data. This template can
    2157             :  * deal with the case where the input data type is real or complex and
    2158             :  * the output is real.
    2159             :  *
    2160             :  * @param pSrcData the source data buffer
    2161             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2162             :  *                      of interest.
    2163             :  * @param pDstData the destination buffer.
    2164             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2165             :  *                      interest.
    2166             :  * @param nWordCount the total number of pixel words to copy
    2167             :  *
    2168             :  * @code
    2169             :  * // Assume an input buffer of type GUInt16 named pBufferIn
    2170             :  * GByte *pBufferOut = new GByte[numBytesOut];
    2171             :  * GDALCopyWordsT<GUInt16, GByte>(pSrcData, 2, pDstData, 1, numBytesOut);
    2172             :  * @endcode
    2173             :  * @note
    2174             :  * This is a private function, and should not be exposed outside of
    2175             :  * rasterio.cpp. External users should call the GDALCopyWords driver function.
    2176             :  */
    2177             : 
    2178             : template <class Tin, class Tout>
    2179    48995023 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
    2180             :                                          int nSrcPixelStride,
    2181             :                                          Tout *const CPL_RESTRICT pDstData,
    2182             :                                          int nDstPixelStride,
    2183             :                                          GPtrDiff_t nWordCount)
    2184             : {
    2185    48995023 :     decltype(nWordCount) nDstOffset = 0;
    2186             : 
    2187    48995023 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2188    48995023 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2189   353965335 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2190             :     {
    2191   304970201 :         const Tin tValue =
    2192   304970201 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2193   304970201 :         Tout *const pOutPixel =
    2194   304970201 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2195             : 
    2196   304970201 :         GDALCopyWord(tValue, *pOutPixel);
    2197             : 
    2198   304970201 :         nDstOffset += nDstPixelStride;
    2199             :     }
    2200    48995023 : }
    2201             : 
    2202             : template <class Tin, class Tout>
    2203    29747008 : static void CPL_NOINLINE GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
    2204             :                                         int nSrcPixelStride,
    2205             :                                         Tout *const CPL_RESTRICT pDstData,
    2206             :                                         int nDstPixelStride,
    2207             :                                         GPtrDiff_t nWordCount)
    2208             : {
    2209    29747008 :     GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
    2210             :                           nWordCount);
    2211    29747008 : }
    2212             : 
    2213             : template <class Tin, class Tout>
    2214     5105605 : static void inline GDALCopyWordsT_8atatime(
    2215             :     const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
    2216             :     Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
    2217             :     GPtrDiff_t nWordCount)
    2218             : {
    2219     5105605 :     decltype(nWordCount) nDstOffset = 0;
    2220             : 
    2221     5105605 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2222     5105605 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2223     5105605 :     decltype(nWordCount) n = 0;
    2224     5105605 :     if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
    2225             :         nDstPixelStride == static_cast<int>(sizeof(Tout)))
    2226             :     {
    2227    53259210 :         for (; n < nWordCount - 7; n += 8)
    2228             :         {
    2229    52704384 :             const Tin *pInValues = reinterpret_cast<const Tin *>(
    2230    52704384 :                 pSrcDataPtr + (n * nSrcPixelStride));
    2231    52704384 :             Tout *const pOutPixels =
    2232    52704384 :                 reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2233             : 
    2234    52704384 :             GDALCopy8Words(pInValues, pOutPixels);
    2235             : 
    2236    52704384 :             nDstOffset += 8 * nDstPixelStride;
    2237             :         }
    2238             :     }
    2239    10508897 :     for (; n < nWordCount; n++)
    2240             :     {
    2241     5403302 :         const Tin tValue =
    2242     5403302 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2243     5403302 :         Tout *const pOutPixel =
    2244     5403302 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2245             : 
    2246     5403302 :         GDALCopyWord(tValue, *pOutPixel);
    2247             : 
    2248     5403302 :         nDstOffset += nDstPixelStride;
    2249             :     }
    2250     5105605 : }
    2251             : 
    2252             : #ifdef HAVE_SSE2
    2253             : 
    2254             : template <class Tout>
    2255     1042126 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
    2256             :                               int nSrcPixelStride,
    2257             :                               Tout *const CPL_RESTRICT pDstData,
    2258             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2259             : {
    2260             :     static_assert(std::is_integral<Tout>::value &&
    2261             :                       sizeof(Tout) == sizeof(uint16_t),
    2262             :                   "Bad Tout");
    2263     1042126 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2264             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2265             :     {
    2266       35752 :         decltype(nWordCount) n = 0;
    2267       35752 :         const __m128i xmm_zero = _mm_setzero_si128();
    2268       35752 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2269             :             reinterpret_cast<GByte *>(pDstData);
    2270     1478148 :         for (; n < nWordCount - 15; n += 16)
    2271             :         {
    2272     1442396 :             __m128i xmm = _mm_loadu_si128(
    2273     1442396 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2274     1442396 :             __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
    2275     1442396 :             __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
    2276             :             _mm_storeu_si128(
    2277     1442396 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
    2278             :             _mm_storeu_si128(
    2279     1442396 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
    2280             :         }
    2281             : #if defined(__clang__)
    2282             : #pragma clang loop vectorize(disable)
    2283             : #endif
    2284      111662 :         for (; n < nWordCount; n++)
    2285             :         {
    2286       75910 :             pDstData[n] = pSrcData[n];
    2287       35752 :         }
    2288             :     }
    2289             :     else
    2290             :     {
    2291     1006371 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2292             :                               nDstPixelStride, nWordCount);
    2293             :     }
    2294     1042126 : }
    2295             : 
    2296             : template <>
    2297     1029400 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2298             :                                  int nSrcPixelStride,
    2299             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    2300             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2301             : {
    2302     1029400 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2303             :                              nDstPixelStride, nWordCount);
    2304     1029400 : }
    2305             : 
    2306             : template <>
    2307       12726 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2308             :                                  int nSrcPixelStride,
    2309             :                                  GInt16 *const CPL_RESTRICT pDstData,
    2310             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2311             : {
    2312       12726 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2313             :                              nDstPixelStride, nWordCount);
    2314       12726 : }
    2315             : 
    2316             : template <class Tout>
    2317    16237876 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
    2318             :                               int nSrcPixelStride,
    2319             :                               Tout *const CPL_RESTRICT pDstData,
    2320             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2321             : {
    2322             :     static_assert(std::is_integral<Tout>::value &&
    2323             :                       sizeof(Tout) == sizeof(uint32_t),
    2324             :                   "Bad Tout");
    2325    16237876 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2326             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2327             :     {
    2328     6533346 :         decltype(nWordCount) n = 0;
    2329     6533346 :         const __m128i xmm_zero = _mm_setzero_si128();
    2330     6533346 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2331             :             reinterpret_cast<GByte *>(pDstData);
    2332    74249727 :         for (; n < nWordCount - 15; n += 16)
    2333             :         {
    2334    67716361 :             __m128i xmm = _mm_loadu_si128(
    2335    67716361 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2336    67716361 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2337    67716361 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2338    67716361 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2339    67716361 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2340    67716361 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2341    67716361 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2342             :             _mm_storeu_si128(
    2343    67716361 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
    2344             :             _mm_storeu_si128(
    2345    67716361 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
    2346             :             _mm_storeu_si128(
    2347    67716361 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
    2348             :             _mm_storeu_si128(
    2349    67716361 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
    2350             :         }
    2351             : #if defined(__clang__)
    2352             : #pragma clang loop vectorize(disable)
    2353             : #endif
    2354    14830716 :         for (; n < nWordCount; n++)
    2355             :         {
    2356     8297350 :             pDstData[n] = pSrcData[n];
    2357     6533346 :         }
    2358             :     }
    2359             :     else
    2360             :     {
    2361     9704510 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2362             :                               nDstPixelStride, nWordCount);
    2363             :     }
    2364    16237876 : }
    2365             : 
    2366             : template <>
    2367         476 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2368             :                                  int nSrcPixelStride,
    2369             :                                  GUInt32 *const CPL_RESTRICT pDstData,
    2370             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2371             : {
    2372         476 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2373             :                              nDstPixelStride, nWordCount);
    2374         476 : }
    2375             : 
    2376             : template <>
    2377    16237400 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2378             :                                  int nSrcPixelStride,
    2379             :                                  GInt32 *const CPL_RESTRICT pDstData,
    2380             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2381             : {
    2382    16237400 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2383             :                              nDstPixelStride, nWordCount);
    2384    16237400 : }
    2385             : 
    2386             : template <>
    2387     2851220 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2388             :                                  int nSrcPixelStride,
    2389             :                                  float *const CPL_RESTRICT pDstData,
    2390             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2391             : {
    2392     2851220 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2393             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2394             :     {
    2395      228331 :         decltype(nWordCount) n = 0;
    2396      228331 :         const __m128i xmm_zero = _mm_setzero_si128();
    2397      228331 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2398             :             reinterpret_cast<GByte *>(pDstData);
    2399     2267440 :         for (; n < nWordCount - 15; n += 16)
    2400             :         {
    2401     2039110 :             __m128i xmm = _mm_loadu_si128(
    2402     2039110 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2403     2039110 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2404     2039110 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2405     2039110 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2406     2039110 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2407     2039110 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2408     2039110 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2409     2039110 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2410     2039110 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2411     2039110 :             __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
    2412     2039110 :             __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
    2413     2039110 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2414             :                           xmm0_f);
    2415             :             _mm_storeu_ps(
    2416     2039110 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2417             :             _mm_storeu_ps(
    2418     2039110 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
    2419             :             _mm_storeu_ps(
    2420     2039110 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
    2421             :         }
    2422             : #if defined(__clang__)
    2423             : #pragma clang loop vectorize(disable)
    2424             : #endif
    2425      952143 :         for (; n < nWordCount; n++)
    2426             :         {
    2427      723812 :             pDstData[n] = pSrcData[n];
    2428      228331 :         }
    2429             :     }
    2430             :     else
    2431             :     {
    2432     2622880 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2433             :                               nDstPixelStride, nWordCount);
    2434             :     }
    2435     2851220 : }
    2436             : 
    2437             : template <>
    2438      180298 : CPL_NOINLINE void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2439             :                                  int nSrcPixelStride,
    2440             :                                  double *const CPL_RESTRICT pDstData,
    2441             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2442             : {
    2443      180298 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2444             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2445             :     {
    2446      146836 :         decltype(nWordCount) n = 0;
    2447      146836 :         const __m128i xmm_zero = _mm_setzero_si128();
    2448      146836 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2449             :             reinterpret_cast<GByte *>(pDstData);
    2450     3125780 :         for (; n < nWordCount - 15; n += 16)
    2451             :         {
    2452     2978940 :             __m128i xmm = _mm_loadu_si128(
    2453     2978940 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2454     2978940 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2455     2978940 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2456     2978940 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2457     2978940 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2458     2978940 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2459     2978940 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2460             : 
    2461             : #if defined(__AVX2__) && defined(slightly_slower_than_SSE2)
    2462             :             _mm256_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2463             :                              _mm256_cvtepi32_pd(xmm0));
    2464             :             _mm256_storeu_pd(
    2465             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2466             :                 _mm256_cvtepi32_pd(xmm1));
    2467             :             _mm256_storeu_pd(
    2468             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2469             :                 _mm256_cvtepi32_pd(xmm2));
    2470             :             _mm256_storeu_pd(
    2471             :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2472             :                 _mm256_cvtepi32_pd(xmm3));
    2473             : #else
    2474     2978940 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2475     2978940 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2476     2978940 :             __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
    2477     2978940 :             __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
    2478     2978940 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2479     2978940 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2480     2978940 :             xmm2 = _mm_srli_si128(xmm2, 8);
    2481     2978940 :             xmm3 = _mm_srli_si128(xmm3, 8);
    2482     2978940 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2483     2978940 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2484     2978940 :             __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
    2485     2978940 :             __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
    2486             : 
    2487     2978940 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2488             :                           xmm0_low_d);
    2489             :             _mm_storeu_pd(
    2490     2978940 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2491             :                 xmm0_high_d);
    2492             :             _mm_storeu_pd(
    2493     2978940 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2494             :                 xmm1_low_d);
    2495             :             _mm_storeu_pd(
    2496     2978940 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2497             :                 xmm1_high_d);
    2498             :             _mm_storeu_pd(
    2499     2978940 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2500             :                 xmm2_low_d);
    2501             :             _mm_storeu_pd(
    2502     2978940 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
    2503             :                 xmm2_high_d);
    2504             :             _mm_storeu_pd(
    2505     2978940 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2506             :                 xmm3_low_d);
    2507             :             _mm_storeu_pd(
    2508     2978940 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
    2509             :                 xmm3_high_d);
    2510             : #endif
    2511             :         }
    2512             : #if defined(__clang__)
    2513             : #pragma clang loop vectorize(disable)
    2514             : #endif
    2515      279978 :         for (; n < nWordCount; n++)
    2516             :         {
    2517      133142 :             pDstData[n] = pSrcData[n];
    2518      146836 :         }
    2519             :     }
    2520             :     else
    2521             :     {
    2522       33462 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2523             :                               nDstPixelStride, nWordCount);
    2524             :     }
    2525      180298 : }
    2526             : 
    2527             : template <>
    2528         148 : CPL_NOINLINE void GDALCopyWordsT(const uint8_t *const CPL_RESTRICT pSrcData,
    2529             :                                  int nSrcPixelStride,
    2530             :                                  int8_t *const CPL_RESTRICT pDstData,
    2531             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2532             : {
    2533         148 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2534             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2535             :     {
    2536         142 :         decltype(nWordCount) n = 0;
    2537         142 :         const __m128i xmm_127 = _mm_set1_epi8(127);
    2538         146 :         for (; n < nWordCount - 31; n += 32)
    2539             :         {
    2540           8 :             __m128i xmm0 = _mm_loadu_si128(
    2541           4 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2542           4 :             __m128i xmm1 = _mm_loadu_si128(
    2543           4 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 16));
    2544           4 :             xmm0 = _mm_min_epu8(xmm0, xmm_127);
    2545           4 :             xmm1 = _mm_min_epu8(xmm1, xmm_127);
    2546           4 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2547           4 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
    2548             :                              xmm1);
    2549             :         }
    2550             : #if defined(__clang__)
    2551             : #pragma clang loop vectorize(disable)
    2552             : #endif
    2553        2424 :         for (; n < nWordCount; n++)
    2554             :         {
    2555        2282 :             pDstData[n] = static_cast<int8_t>(std::min<int>(pSrcData[n], 127));
    2556         142 :         }
    2557             :     }
    2558             :     else
    2559             :     {
    2560           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2561             :                               nDstPixelStride, nWordCount);
    2562             :     }
    2563         148 : }
    2564             : 
    2565             : template <>
    2566          62 : CPL_NOINLINE void GDALCopyWordsT(const int8_t *const CPL_RESTRICT pSrcData,
    2567             :                                  int nSrcPixelStride,
    2568             :                                  uint8_t *const CPL_RESTRICT pDstData,
    2569             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2570             : {
    2571          62 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2572             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2573             :     {
    2574          56 :         decltype(nWordCount) n = 0;
    2575             : #if !(defined(__SSE4_1__) || defined(__AVX__) ||                               \
    2576             :       defined(USE_NEON_OPTIMIZATIONS))
    2577          56 :         const __m128i xmm_INT8_to_UINT8 = _mm_set1_epi8(-128);
    2578             : #endif
    2579         117 :         for (; n < nWordCount - 31; n += 32)
    2580             :         {
    2581         122 :             __m128i xmm0 = _mm_loadu_si128(
    2582          61 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2583          61 :             __m128i xmm1 = _mm_loadu_si128(
    2584          61 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 16));
    2585             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    2586             :             xmm0 = _mm_max_epi8(xmm0, _mm_setzero_si128());
    2587             :             xmm1 = _mm_max_epi8(xmm1, _mm_setzero_si128());
    2588             : #else
    2589          61 :             xmm0 = _mm_add_epi8(xmm0, xmm_INT8_to_UINT8);
    2590          61 :             xmm1 = _mm_add_epi8(xmm1, xmm_INT8_to_UINT8);
    2591          61 :             xmm0 = _mm_max_epu8(xmm0, xmm_INT8_to_UINT8);
    2592          61 :             xmm1 = _mm_max_epu8(xmm1, xmm_INT8_to_UINT8);
    2593          61 :             xmm0 = _mm_sub_epi8(xmm0, xmm_INT8_to_UINT8);
    2594          61 :             xmm1 = _mm_sub_epi8(xmm1, xmm_INT8_to_UINT8);
    2595             : #endif
    2596          61 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2597          61 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
    2598             :                              xmm1);
    2599             :         }
    2600             : #if defined(__clang__)
    2601             : #pragma clang loop vectorize(disable)
    2602             : #endif
    2603         352 :         for (; n < nWordCount; n++)
    2604             :         {
    2605         296 :             pDstData[n] = static_cast<uint8_t>(std::max<int>(pSrcData[n], 0));
    2606          56 :         }
    2607             :     }
    2608             :     else
    2609             :     {
    2610           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2611             :                               nDstPixelStride, nWordCount);
    2612             :     }
    2613          62 : }
    2614             : 
    2615             : template <>
    2616        6037 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2617             :                                  int nSrcPixelStride,
    2618             :                                  uint8_t *const CPL_RESTRICT pDstData,
    2619             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2620             : {
    2621        6037 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2622             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2623             :     {
    2624        5062 :         decltype(nWordCount) n = 0;
    2625             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    2626             :         const auto xmm_MAX_INT16 = _mm_set1_epi16(32767);
    2627             : #else
    2628             :         // In SSE2, min_epu16 does not exist, so shift from
    2629             :         // UInt16 to SInt16 to be able to use min_epi16
    2630        5062 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2631        5062 :         const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
    2632             : #endif
    2633       71888 :         for (; n < nWordCount - 15; n += 16)
    2634             :         {
    2635      133652 :             __m128i xmm0 = _mm_loadu_si128(
    2636       66826 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2637       66826 :             __m128i xmm1 = _mm_loadu_si128(
    2638       66826 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2639             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    2640             :             xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
    2641             :             xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
    2642             : #else
    2643       66826 :             xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
    2644       66826 :             xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
    2645       66826 :             xmm0 = _mm_min_epi16(xmm0, xmm_m255_shifted);
    2646       66826 :             xmm1 = _mm_min_epi16(xmm1, xmm_m255_shifted);
    2647       66826 :             xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
    2648       66826 :             xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
    2649             : #endif
    2650       66826 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    2651       66826 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2652             :         }
    2653             : #if defined(__clang__)
    2654             : #pragma clang loop vectorize(disable)
    2655             : #endif
    2656       16403 :         for (; n < nWordCount; n++)
    2657             :         {
    2658       11341 :             pDstData[n] = static_cast<uint8_t>(std::min<int>(pSrcData[n], 255));
    2659        5062 :         }
    2660             :     }
    2661             :     else
    2662             :     {
    2663         975 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2664             :                               nDstPixelStride, nWordCount);
    2665             :     }
    2666        6037 : }
    2667             : 
    2668             : template <>
    2669          46 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2670             :                                  int nSrcPixelStride,
    2671             :                                  int16_t *const CPL_RESTRICT pDstData,
    2672             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2673             : {
    2674          46 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2675             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2676             :     {
    2677          40 :         decltype(nWordCount) n = 0;
    2678             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    2679             :         const __m128i xmm_MAX_INT16 = _mm_set1_epi16(32767);
    2680             : #else
    2681             :         // In SSE2, min_epu16 does not exist, so shift from
    2682             :         // UInt16 to SInt16 to be able to use min_epi16
    2683          40 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2684          40 :         const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
    2685             : #endif
    2686         169 :         for (; n < nWordCount - 15; n += 16)
    2687             :         {
    2688         258 :             __m128i xmm0 = _mm_loadu_si128(
    2689         129 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2690         129 :             __m128i xmm1 = _mm_loadu_si128(
    2691         129 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2692             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    2693             :             xmm0 = _mm_min_epu16(xmm0, xmm_MAX_INT16);
    2694             :             xmm1 = _mm_min_epu16(xmm1, xmm_MAX_INT16);
    2695             : #else
    2696         129 :             xmm0 = _mm_add_epi16(xmm0, xmm_UINT16_to_INT16);
    2697         129 :             xmm1 = _mm_add_epi16(xmm1, xmm_UINT16_to_INT16);
    2698         129 :             xmm0 = _mm_min_epi16(xmm0, xmm_32767_shifted);
    2699         129 :             xmm1 = _mm_min_epi16(xmm1, xmm_32767_shifted);
    2700         129 :             xmm0 = _mm_sub_epi16(xmm0, xmm_UINT16_to_INT16);
    2701         129 :             xmm1 = _mm_sub_epi16(xmm1, xmm_UINT16_to_INT16);
    2702             : #endif
    2703         129 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2704         129 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
    2705             :                              xmm1);
    2706             :         }
    2707             : #if defined(__clang__)
    2708             : #pragma clang loop vectorize(disable)
    2709             : #endif
    2710         191 :         for (; n < nWordCount; n++)
    2711             :         {
    2712         151 :             pDstData[n] =
    2713         151 :                 static_cast<int16_t>(std::min<int>(pSrcData[n], 32767));
    2714          40 :         }
    2715             :     }
    2716             :     else
    2717             :     {
    2718           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2719             :                               nDstPixelStride, nWordCount);
    2720             :     }
    2721          46 : }
    2722             : 
    2723             : template <>
    2724         136 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2725             :                                  int nSrcPixelStride,
    2726             :                                  uint16_t *const CPL_RESTRICT pDstData,
    2727             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2728             : {
    2729         136 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2730             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2731             :     {
    2732          93 :         decltype(nWordCount) n = 0;
    2733          93 :         const __m128i xmm_zero = _mm_setzero_si128();
    2734         278 :         for (; n < nWordCount - 15; n += 16)
    2735             :         {
    2736         370 :             __m128i xmm0 = _mm_loadu_si128(
    2737         185 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2738         185 :             __m128i xmm1 = _mm_loadu_si128(
    2739         185 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    2740         185 :             xmm0 = _mm_max_epi16(xmm0, xmm_zero);
    2741         185 :             xmm1 = _mm_max_epi16(xmm1, xmm_zero);
    2742         185 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2743         185 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
    2744             :                              xmm1);
    2745             :         }
    2746             : #if defined(__clang__)
    2747             : #pragma clang loop vectorize(disable)
    2748             : #endif
    2749         471 :         for (; n < nWordCount; n++)
    2750             :         {
    2751         378 :             pDstData[n] = static_cast<uint16_t>(std::max<int>(pSrcData[n], 0));
    2752          93 :         }
    2753             :     }
    2754             :     else
    2755             :     {
    2756          43 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2757             :                               nDstPixelStride, nWordCount);
    2758             :     }
    2759         136 : }
    2760             : 
    2761             : template <>
    2762        3150 : CPL_NOINLINE void GDALCopyWordsT(const uint32_t *const CPL_RESTRICT pSrcData,
    2763             :                                  int nSrcPixelStride,
    2764             :                                  int32_t *const CPL_RESTRICT pDstData,
    2765             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2766             : {
    2767        3150 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2768             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2769             :     {
    2770        2196 :         decltype(nWordCount) n = 0;
    2771        2196 :         const __m128i xmm_MAX_INT = _mm_set1_epi32(INT_MAX);
    2772        2196 :         [[maybe_unused]] const __m128i bias = _mm_set1_epi32(INT_MIN);
    2773             :         [[maybe_unused]] const __m128i xmm_MAX_INT_biased =
    2774        2196 :             _mm_xor_si128(xmm_MAX_INT, bias);
    2775       45597 :         for (; n < nWordCount - 7; n += 8)
    2776             :         {
    2777       86802 :             __m128i xmm0 = _mm_loadu_si128(
    2778       43401 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2779       43401 :             __m128i xmm1 = _mm_loadu_si128(
    2780       43401 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    2781             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    2782             :             xmm0 = _mm_min_epu32(xmm0, xmm_MAX_INT);
    2783             :             xmm1 = _mm_min_epu32(xmm1, xmm_MAX_INT);
    2784             : #else
    2785       43401 :             const __m128i xmm0_biased = _mm_xor_si128(xmm0, bias);
    2786             :             const __m128i mask0 =
    2787       43401 :                 _mm_cmplt_epi32(xmm0_biased, xmm_MAX_INT_biased);
    2788       43401 :             xmm0 = GDALIfThenElse(mask0, xmm0, xmm_MAX_INT);
    2789             : 
    2790       43401 :             const __m128i xmm1_biased = _mm_xor_si128(xmm1, bias);
    2791             :             const __m128i mask1 =
    2792       43401 :                 _mm_cmplt_epi32(xmm1_biased, xmm_MAX_INT_biased);
    2793       43401 :             xmm1 = GDALIfThenElse(mask1, xmm1, xmm_MAX_INT);
    2794             : #endif
    2795       43401 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2796       43401 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
    2797             :                              xmm1);
    2798             :         }
    2799             : #if defined(__clang__)
    2800             : #pragma clang loop vectorize(disable)
    2801             : #endif
    2802        9390 :         for (; n < nWordCount; n++)
    2803             :         {
    2804        7194 :             pDstData[n] =
    2805        7194 :                 static_cast<int32_t>(std::min<uint32_t>(pSrcData[n], INT_MAX));
    2806        2196 :         }
    2807             :     }
    2808             :     else
    2809             :     {
    2810         954 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2811             :                               nDstPixelStride, nWordCount);
    2812             :     }
    2813        3150 : }
    2814             : 
    2815             : template <>
    2816          93 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    2817             :                                  int nSrcPixelStride,
    2818             :                                  uint32_t *const CPL_RESTRICT pDstData,
    2819             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2820             : {
    2821          93 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2822             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2823             :     {
    2824          38 :         decltype(nWordCount) n = 0;
    2825          38 :         const __m128i xmm_zero = _mm_setzero_si128();
    2826         333 :         for (; n < nWordCount - 7; n += 8)
    2827             :         {
    2828         590 :             __m128i xmm0 = _mm_loadu_si128(
    2829         295 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2830         295 :             __m128i xmm1 = _mm_loadu_si128(
    2831         295 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    2832             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    2833             :             xmm0 = _mm_max_epi32(xmm0, xmm_zero);
    2834             :             xmm1 = _mm_max_epi32(xmm1, xmm_zero);
    2835             : #else
    2836         295 :             const __m128i mask0 = _mm_cmpgt_epi32(xmm0, xmm_zero);
    2837         295 :             const __m128i mask1 = _mm_cmpgt_epi32(xmm1, xmm_zero);
    2838         295 :             xmm0 = _mm_and_si128(xmm0, mask0);
    2839         295 :             xmm1 = _mm_and_si128(xmm1, mask1);
    2840             : #endif
    2841         295 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm0);
    2842         295 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 4),
    2843             :                              xmm1);
    2844             :         }
    2845             : #if defined(__clang__)
    2846             : #pragma clang loop vectorize(disable)
    2847             : #endif
    2848         192 :         for (; n < nWordCount; n++)
    2849             :         {
    2850         154 :             pDstData[n] = static_cast<uint32_t>(std::max(pSrcData[n], 0));
    2851          38 :         }
    2852             :     }
    2853             :     else
    2854             :     {
    2855          55 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2856             :                               nDstPixelStride, nWordCount);
    2857             :     }
    2858          93 : }
    2859             : 
    2860             : template <>
    2861         403 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2862             :                                  int nSrcPixelStride,
    2863             :                                  float *const CPL_RESTRICT pDstData,
    2864             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2865             : {
    2866         403 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2867             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2868             :     {
    2869         397 :         decltype(nWordCount) n = 0;
    2870         397 :         const __m128i xmm_zero = _mm_setzero_si128();
    2871         397 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2872             :             reinterpret_cast<GByte *>(pDstData);
    2873        1688 :         for (; n < nWordCount - 7; n += 8)
    2874             :         {
    2875        1291 :             __m128i xmm = _mm_loadu_si128(
    2876        1291 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2877        1291 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2878        1291 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2879        1291 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2880        1291 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2881        1291 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2882             :                           xmm0_f);
    2883             :             _mm_storeu_ps(
    2884        1291 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2885             :         }
    2886             : #if defined(__clang__)
    2887             : #pragma clang loop vectorize(disable)
    2888             : #endif
    2889        1415 :         for (; n < nWordCount; n++)
    2890             :         {
    2891        1018 :             pDstData[n] = pSrcData[n];
    2892         397 :         }
    2893             :     }
    2894             :     else
    2895             :     {
    2896           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2897             :                               nDstPixelStride, nWordCount);
    2898             :     }
    2899         403 : }
    2900             : 
    2901             : template <>
    2902     1076640 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2903             :                                  int nSrcPixelStride,
    2904             :                                  float *const CPL_RESTRICT pDstData,
    2905             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2906             : {
    2907     1076640 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2908             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2909             :     {
    2910       86742 :         decltype(nWordCount) n = 0;
    2911       86742 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2912             :             reinterpret_cast<GByte *>(pDstData);
    2913      586119 :         for (; n < nWordCount - 7; n += 8)
    2914             :         {
    2915      499377 :             __m128i xmm = _mm_loadu_si128(
    2916      499377 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2917      499377 :             const auto sign = _mm_srai_epi16(xmm, 15);
    2918      499377 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
    2919      499377 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
    2920      499377 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2921      499377 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2922      499377 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2923             :                           xmm0_f);
    2924             :             _mm_storeu_ps(
    2925      499377 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2926             :         }
    2927             : #if defined(__clang__)
    2928             : #pragma clang loop vectorize(disable)
    2929             : #endif
    2930      253882 :         for (; n < nWordCount; n++)
    2931             :         {
    2932      167140 :             pDstData[n] = pSrcData[n];
    2933       86742 :         }
    2934             :     }
    2935             :     else
    2936             :     {
    2937      989901 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2938             :                               nDstPixelStride, nWordCount);
    2939             :     }
    2940     1076640 : }
    2941             : 
    2942             : template <>
    2943         449 : CPL_NOINLINE void GDALCopyWordsT(const uint16_t *const CPL_RESTRICT pSrcData,
    2944             :                                  int nSrcPixelStride,
    2945             :                                  double *const CPL_RESTRICT pDstData,
    2946             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    2947             : {
    2948         449 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2949             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2950             :     {
    2951         313 :         decltype(nWordCount) n = 0;
    2952         313 :         const __m128i xmm_zero = _mm_setzero_si128();
    2953         313 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2954             :             reinterpret_cast<GByte *>(pDstData);
    2955         829 :         for (; n < nWordCount - 7; n += 8)
    2956             :         {
    2957         516 :             __m128i xmm = _mm_loadu_si128(
    2958         516 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2959         516 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2960         516 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2961             : 
    2962         516 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2963         516 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2964         516 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2965         516 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2966         516 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2967         516 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2968             : 
    2969         516 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2970             :                           xmm0_low_d);
    2971             :             _mm_storeu_pd(
    2972         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2973             :                 xmm0_high_d);
    2974             :             _mm_storeu_pd(
    2975         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2976             :                 xmm1_low_d);
    2977             :             _mm_storeu_pd(
    2978         516 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2979             :                 xmm1_high_d);
    2980             :         }
    2981             : #if defined(__clang__)
    2982             : #pragma clang loop vectorize(disable)
    2983             : #endif
    2984        1082 :         for (; n < nWordCount; n++)
    2985             :         {
    2986         769 :             pDstData[n] = pSrcData[n];
    2987         313 :         }
    2988             :     }
    2989             :     else
    2990             :     {
    2991         136 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2992             :                               nDstPixelStride, nWordCount);
    2993             :     }
    2994         449 : }
    2995             : 
    2996             : template <>
    2997     4923280 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    2998             :                                  int nSrcPixelStride,
    2999             :                                  double *const CPL_RESTRICT pDstData,
    3000             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3001             : {
    3002     4923280 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    3003             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    3004             :     {
    3005       34874 :         decltype(nWordCount) n = 0;
    3006       34874 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    3007             :             reinterpret_cast<GByte *>(pDstData);
    3008      403828 :         for (; n < nWordCount - 7; n += 8)
    3009             :         {
    3010      368954 :             __m128i xmm = _mm_loadu_si128(
    3011      368954 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    3012      368954 :             const auto sign = _mm_srai_epi16(xmm, 15);
    3013      368954 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, sign);
    3014      368954 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, sign);
    3015             : 
    3016      368954 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    3017      368954 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    3018      368954 :             xmm0 = _mm_srli_si128(xmm0, 8);
    3019      368954 :             xmm1 = _mm_srli_si128(xmm1, 8);
    3020      368954 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    3021      368954 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    3022             : 
    3023      368954 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    3024             :                           xmm0_low_d);
    3025             :             _mm_storeu_pd(
    3026      368954 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    3027             :                 xmm0_high_d);
    3028             :             _mm_storeu_pd(
    3029      368954 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    3030             :                 xmm1_low_d);
    3031             :             _mm_storeu_pd(
    3032      368954 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    3033             :                 xmm1_high_d);
    3034             :         }
    3035             : #if defined(__clang__)
    3036             : #pragma clang loop vectorize(disable)
    3037             : #endif
    3038      255934 :         for (; n < nWordCount; n++)
    3039             :         {
    3040      221060 :             pDstData[n] = pSrcData[n];
    3041       34874 :         }
    3042             :     }
    3043             :     else
    3044             :     {
    3045     4888400 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    3046             :                               nDstPixelStride, nWordCount);
    3047             :     }
    3048     4923280 : }
    3049             : 
    3050             : // ---- AVX2 helpers for int32 narrowing (runtime dispatch) ----
    3051             : 
    3052             : #if defined(HAVE_AVX2_DISPATCH)
    3053             : #if !defined(_MSC_VER)
    3054             : __attribute__((target("avx2")))
    3055             : #endif
    3056       12723 : static void GDALCopyWordsInt32ToUInt8_AVX2(const int32_t *CPL_RESTRICT pSrc,
    3057             :                                            uint8_t *CPL_RESTRICT pDst,
    3058             :                                            GPtrDiff_t nWordCount)
    3059             : {
    3060       12723 :     const __m256i permuteIdx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
    3061       12723 :     GPtrDiff_t n = 0;
    3062      958119 :     for (; n < nWordCount - 31; n += 32)
    3063             :     {
    3064             :         __m256i v0 =
    3065      945396 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
    3066             :         __m256i v1 =
    3067      945396 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
    3068      945396 :         __m256i v2 = _mm256_loadu_si256(
    3069      945396 :             reinterpret_cast<const __m256i *>(pSrc + n + 16));
    3070      945396 :         __m256i v3 = _mm256_loadu_si256(
    3071      945396 :             reinterpret_cast<const __m256i *>(pSrc + n + 24));
    3072             :         // Clamp to [0, 255]
    3073             :         // Pack int32 -> int16 -> uint8, then fix cross-lane ordering
    3074      945396 :         __m256i ab16 = _mm256_packs_epi32(v0, v1);
    3075      945396 :         __m256i cd16 = _mm256_packs_epi32(v2, v3);
    3076      945396 :         __m256i bytes = _mm256_packus_epi16(ab16, cd16);
    3077      945396 :         bytes = _mm256_permutevar8x32_epi32(bytes, permuteIdx);
    3078      945396 :         _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), bytes);
    3079             :     }
    3080             : #if defined(__clang__)
    3081             : #pragma clang loop vectorize(disable)
    3082             : #endif
    3083       68589 :     for (; n < nWordCount; n++)
    3084             :     {
    3085       55866 :         pDst[n] = static_cast<uint8_t>(std::clamp(pSrc[n], 0, 255));
    3086             :     }
    3087       12723 : }
    3088             : 
    3089             : #if !defined(_MSC_VER)
    3090             : __attribute__((target("avx2")))
    3091             : #endif
    3092       10277 : static void GDALCopyWordsInt32ToUInt16_AVX2(const int32_t *CPL_RESTRICT pSrc,
    3093             :                                             uint16_t *CPL_RESTRICT pDst,
    3094             :                                             GPtrDiff_t nWordCount)
    3095             : {
    3096             :     // _mm256_packus_epi32(v0, v1) produces per-lane interleaved result:
    3097             :     //   [v0_lo4, v1_lo4, v0_hi4, v1_hi4] (in uint16 pairs per 32-bit lane)
    3098             :     // Permute to deinterleave: all v0 values first, then all v1 values
    3099       10277 :     const __m256i permuteIdx = _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7);
    3100       10277 :     GPtrDiff_t n = 0;
    3101      670572 :     for (; n < nWordCount - 15; n += 16)
    3102             :     {
    3103             :         __m256i v0 =
    3104      660295 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n));
    3105             :         __m256i v1 =
    3106     1320590 :             _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pSrc + n + 8));
    3107             :         // Clamp to [0, 65535]: _mm256_packus_epi32 saturates uint
    3108      660295 :         __m256i packed = _mm256_packus_epi32(v0, v1);
    3109             :         // Fix cross-lane interleave from packus
    3110      660295 :         packed = _mm256_permutevar8x32_epi32(packed, permuteIdx);
    3111      660295 :         _mm256_storeu_si256(reinterpret_cast<__m256i *>(pDst + n), packed);
    3112             :     }
    3113             : #if defined(__clang__)
    3114             : #pragma clang loop vectorize(disable)
    3115             : #endif
    3116      163928 :     for (; n < nWordCount; n++)
    3117             :     {
    3118      153651 :         pDst[n] = static_cast<uint16_t>(std::clamp(pSrc[n], 0, 65535));
    3119             :     }
    3120       10277 : }
    3121             : #endif  // HAVE_AVX2_DISPATCH
    3122             : 
    3123             : // ---- int32 -> uint8 with clamping to [0, 255] ----
    3124             : template <>
    3125       12837 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    3126             :                                  int nSrcPixelStride,
    3127             :                                  uint8_t *const CPL_RESTRICT pDstData,
    3128             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3129             : {
    3130       12837 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    3131             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    3132             :     {
    3133             : #if defined(HAVE_AVX2_DISPATCH)
    3134       12723 :         if (CPLHaveRuntimeAVX2())
    3135             :         {
    3136       12723 :             GDALCopyWordsInt32ToUInt8_AVX2(pSrcData, pDstData, nWordCount);
    3137       12723 :             return;
    3138             :         }
    3139             : #endif
    3140             : 
    3141             :         // SSE2 path: 16 pixels per iteration
    3142           0 :         decltype(nWordCount) n = 0;
    3143           0 :         for (; n < nWordCount - 15; n += 16)
    3144             :         {
    3145           0 :             __m128i v0 = _mm_loadu_si128(
    3146           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    3147           0 :             __m128i v1 = _mm_loadu_si128(
    3148           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    3149           0 :             __m128i v2 = _mm_loadu_si128(
    3150           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    3151           0 :             __m128i v3 = _mm_loadu_si128(
    3152           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 12));
    3153             :             // Pack int32->int16 with signed saturation to [-32768,32767] range
    3154           0 :             __m128i lo16 = _mm_packs_epi32(v0, v1);
    3155           0 :             __m128i hi16 = _mm_packs_epi32(v2, v3);
    3156             :             // Pack int16->uint8 with unsigned saturation to [0,255] range
    3157           0 :             __m128i bytes = _mm_packus_epi16(lo16, hi16);
    3158           0 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), bytes);
    3159             :         }
    3160             : #if defined(__clang__)
    3161             : #pragma clang loop vectorize(disable)
    3162             : #endif
    3163           0 :         for (; n < nWordCount; n++)
    3164             :         {
    3165           0 :             pDstData[n] = static_cast<uint8_t>(std::clamp(pSrcData[n], 0, 255));
    3166           0 :         }
    3167             :     }
    3168             :     else
    3169             :     {
    3170         114 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    3171             :                               nDstPixelStride, nWordCount);
    3172             :     }
    3173             : }
    3174             : 
    3175             : // ---- int32 -> uint16 with clamping to [0, 65535] ----
    3176             : template <>
    3177       10322 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    3178             :                                  int nSrcPixelStride,
    3179             :                                  uint16_t *const CPL_RESTRICT pDstData,
    3180             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3181             : {
    3182       10322 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    3183             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    3184             :     {
    3185             : #if defined(HAVE_AVX2_DISPATCH)
    3186       10277 :         if (CPLHaveRuntimeAVX2())
    3187             :         {
    3188       10277 :             GDALCopyWordsInt32ToUInt16_AVX2(pSrcData, pDstData, nWordCount);
    3189       10277 :             return;
    3190             :         }
    3191             : #endif
    3192           0 :         decltype(nWordCount) n = 0;
    3193           0 :         for (; n < nWordCount - 15; n += 16)
    3194             :         {
    3195           0 :             __m128i v0 = _mm_loadu_si128(
    3196           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    3197           0 :             __m128i v1 = _mm_loadu_si128(
    3198           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    3199           0 :             __m128i v2 = _mm_loadu_si128(
    3200           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    3201           0 :             __m128i v3 = _mm_loadu_si128(
    3202           0 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 12));
    3203           0 :             const auto packed_lo = GDAL_mm_packus_epi32(v0, v1);
    3204           0 :             const auto packed_hi = GDAL_mm_packus_epi32(v2, v3);
    3205           0 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n),
    3206             :                              packed_lo);
    3207           0 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
    3208             :                              packed_hi);
    3209             :         }
    3210             : #if defined(__clang__)
    3211             : #pragma clang loop vectorize(disable)
    3212             : #endif
    3213           0 :         for (; n < nWordCount; n++)
    3214             :         {
    3215           0 :             pDstData[n] =
    3216           0 :                 static_cast<uint16_t>(std::clamp(pSrcData[n], 0, 65535));
    3217           0 :         }
    3218             :     }
    3219             :     else
    3220             :     {
    3221          45 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    3222             :                               nDstPixelStride, nWordCount);
    3223             :     }
    3224             : }
    3225             : 
    3226             : // ---- int32 -> int16 with clamping to [-32768, 32767] ----
    3227             : template <>
    3228          98 : CPL_NOINLINE void GDALCopyWordsT(const int32_t *const CPL_RESTRICT pSrcData,
    3229             :                                  int nSrcPixelStride,
    3230             :                                  int16_t *const CPL_RESTRICT pDstData,
    3231             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3232             : {
    3233          98 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    3234             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    3235             :     {
    3236             :         // SSE2 path: 16 pixels per iteration
    3237          43 :         decltype(nWordCount) n = 0;
    3238         268 :         for (; n < nWordCount - 15; n += 16)
    3239             :         {
    3240         450 :             __m128i v0 = _mm_loadu_si128(
    3241         225 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    3242         450 :             __m128i v1 = _mm_loadu_si128(
    3243         225 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 4));
    3244         450 :             __m128i v2 = _mm_loadu_si128(
    3245         225 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    3246         225 :             __m128i v3 = _mm_loadu_si128(
    3247         225 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 12));
    3248             :             // Pack int32->int16 with signed saturation to [-32768,32767] range
    3249         225 :             __m128i packed_lo = _mm_packs_epi32(v0, v1);
    3250         225 :             __m128i packed_hi = _mm_packs_epi32(v2, v3);
    3251         225 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n),
    3252             :                              packed_lo);
    3253         225 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 8),
    3254             :                              packed_hi);
    3255             :         }
    3256             : #if defined(__clang__)
    3257             : #pragma clang loop vectorize(disable)
    3258             : #endif
    3259         191 :         for (; n < nWordCount; n++)
    3260             :         {
    3261         148 :             pDstData[n] =
    3262         148 :                 static_cast<int16_t>(std::clamp(pSrcData[n], -32768, 32767));
    3263          43 :         }
    3264             :     }
    3265             :     else
    3266             :     {
    3267          55 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    3268             :                               nDstPixelStride, nWordCount);
    3269             :     }
    3270          98 : }
    3271             : 
    3272             : // ---- int16 -> uint8 with clamping to [0, 255] ----
    3273             : template <>
    3274       17428 : CPL_NOINLINE void GDALCopyWordsT(const int16_t *const CPL_RESTRICT pSrcData,
    3275             :                                  int nSrcPixelStride,
    3276             :                                  uint8_t *const CPL_RESTRICT pDstData,
    3277             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3278             : {
    3279       17428 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    3280             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    3281             :     {
    3282             :         // SSE2 path: 32 pixels per iteration
    3283       17338 :         decltype(nWordCount) n = 0;
    3284       85649 :         for (; n < nWordCount - 31; n += 32)
    3285             :         {
    3286      136622 :             __m128i v0 = _mm_loadu_si128(
    3287       68311 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    3288      136622 :             __m128i v1 = _mm_loadu_si128(
    3289       68311 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 8));
    3290      136622 :             __m128i v2 = _mm_loadu_si128(
    3291       68311 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 16));
    3292       68311 :             __m128i v3 = _mm_loadu_si128(
    3293       68311 :                 reinterpret_cast<const __m128i *>(pSrcData + n + 24));
    3294             :             // Pack int16->uint8 with unsigned saturation to [0, 255] range
    3295       68311 :             __m128i packed_lo = _mm_packus_epi16(v0, v1);
    3296       68311 :             __m128i packed_hi = _mm_packus_epi16(v2, v3);
    3297       68311 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n),
    3298             :                              packed_lo);
    3299       68311 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n + 16),
    3300             :                              packed_hi);
    3301             :         }
    3302             : #if defined(__clang__)
    3303             : #pragma clang loop vectorize(disable)
    3304             : #endif
    3305      214741 :         for (; n < nWordCount; n++)
    3306             :         {
    3307      197403 :             pDstData[n] =
    3308      197403 :                 static_cast<uint8_t>(std::clamp<int>(pSrcData[n], 0, 255));
    3309       17338 :         }
    3310             :     }
    3311             :     else
    3312             :     {
    3313          90 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    3314             :                               nDstPixelStride, nWordCount);
    3315             :     }
    3316       17428 : }
    3317             : 
    3318             : #endif  // HAVE_SSE2
    3319             : 
    3320             : template <>
    3321     4437410 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3322             :                                  int nSrcPixelStride,
    3323             :                                  GByte *const CPL_RESTRICT pDstData,
    3324             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3325             : {
    3326     4437410 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3327             :                             nDstPixelStride, nWordCount);
    3328     4437410 : }
    3329             : 
    3330             : template <>
    3331       38470 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3332             :                                  int nSrcPixelStride,
    3333             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    3334             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3335             : {
    3336       38470 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3337             :                             nDstPixelStride, nWordCount);
    3338       38470 : }
    3339             : 
    3340             : template <>
    3341       56851 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3342             :                                  int nSrcPixelStride,
    3343             :                                  double *const CPL_RESTRICT pDstData,
    3344             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3345             : {
    3346       56851 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3347             :                             nDstPixelStride, nWordCount);
    3348       56851 : }
    3349             : 
    3350             : template <>
    3351      122862 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3352             :                                  int nSrcPixelStride,
    3353             :                                  float *const CPL_RESTRICT pDstData,
    3354             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3355             : {
    3356      122862 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3357             :                             nDstPixelStride, nWordCount);
    3358      122862 : }
    3359             : 
    3360             : template <>
    3361         412 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
    3362             :                                  int nSrcPixelStride,
    3363             :                                  float *const CPL_RESTRICT pDstData,
    3364             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3365             : {
    3366         412 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3367             :                             nDstPixelStride, nWordCount);
    3368         412 : }
    3369             : 
    3370             : template <>
    3371         544 : CPL_NOINLINE void GDALCopyWordsT(const GFloat16 *const CPL_RESTRICT pSrcData,
    3372             :                                  int nSrcPixelStride,
    3373             :                                  double *const CPL_RESTRICT pDstData,
    3374             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3375             : {
    3376         544 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3377             :                             nDstPixelStride, nWordCount);
    3378         544 : }
    3379             : 
    3380             : template <>
    3381      327359 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3382             :                                  int nSrcPixelStride,
    3383             :                                  GByte *const CPL_RESTRICT pDstData,
    3384             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3385             : {
    3386      327359 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3387             :                             nDstPixelStride, nWordCount);
    3388      327359 : }
    3389             : 
    3390             : template <>
    3391          61 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3392             :                                  int nSrcPixelStride,
    3393             :                                  GInt8 *const CPL_RESTRICT pDstData,
    3394             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3395             : {
    3396          61 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3397             :                             nDstPixelStride, nWordCount);
    3398          61 : }
    3399             : 
    3400             : template <>
    3401       15791 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3402             :                                  int nSrcPixelStride,
    3403             :                                  GInt16 *const CPL_RESTRICT pDstData,
    3404             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3405             : {
    3406       15791 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3407             :                             nDstPixelStride, nWordCount);
    3408       15791 : }
    3409             : 
    3410             : template <>
    3411       61719 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3412             :                                  int nSrcPixelStride,
    3413             :                                  GUInt16 *const CPL_RESTRICT pDstData,
    3414             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3415             : {
    3416       61719 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3417             :                             nDstPixelStride, nWordCount);
    3418       61719 : }
    3419             : 
    3420             : template <>
    3421       43991 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3422             :                                  int nSrcPixelStride,
    3423             :                                  GInt32 *const CPL_RESTRICT pDstData,
    3424             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3425             : {
    3426       43991 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3427             :                             nDstPixelStride, nWordCount);
    3428       43991 : }
    3429             : 
    3430             : template <>
    3431          72 : CPL_NOINLINE void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    3432             :                                  int nSrcPixelStride,
    3433             :                                  GFloat16 *const CPL_RESTRICT pDstData,
    3434             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3435             : {
    3436          72 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3437             :                             nDstPixelStride, nWordCount);
    3438          72 : }
    3439             : 
    3440             : template <>
    3441          63 : CPL_NOINLINE void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    3442             :                                  int nSrcPixelStride,
    3443             :                                  GFloat16 *const CPL_RESTRICT pDstData,
    3444             :                                  int nDstPixelStride, GPtrDiff_t nWordCount)
    3445             : {
    3446          63 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    3447             :                             nDstPixelStride, nWordCount);
    3448          63 : }
    3449             : 
    3450             : /************************************************************************/
    3451             : /*                       GDALCopyWordsComplexT()                        */
    3452             : /************************************************************************/
    3453             : /**
    3454             :  * Template function, used to copy data from pSrcData into buffer
    3455             :  * pDstData, with stride nSrcPixelStride in the source data and
    3456             :  * stride nDstPixelStride in the destination data. Deals with the
    3457             :  * complex case, where input is complex and output is complex.
    3458             :  *
    3459             :  * @param pSrcData the source data buffer
    3460             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    3461             :  *                      of interest.
    3462             :  * @param pDstData the destination buffer.
    3463             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    3464             :  *                      interest.
    3465             :  * @param nWordCount the total number of pixel words to copy
    3466             :  *
    3467             :  */
    3468             : template <class Tin, class Tout>
    3469       98788 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
    3470             :                                   int nSrcPixelStride,
    3471             :                                   Tout *const CPL_RESTRICT pDstData,
    3472             :                                   int nDstPixelStride, GPtrDiff_t nWordCount)
    3473             : {
    3474       98788 :     decltype(nWordCount) nDstOffset = 0;
    3475       98788 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    3476       98788 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    3477             :     // Both strides are added to char pointers, so they are byte offsets.
    3478     5631239 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3479             :     {
    3480     5532446 :         const Tin *const pPixelIn =
    3481     5532446 :             reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    3482     5532446 :         Tout *const pPixelOut =
    3483     5532446 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    3484             :         // A pixel is two consecutive words; convert each one separately.
    3485     5532446 :         GDALCopyWord(pPixelIn[0], pPixelOut[0]);
    3486     5532446 :         GDALCopyWord(pPixelIn[1], pPixelOut[1]);
    3487             : 
    3488     5532446 :         nDstOffset += nDstPixelStride;
    3489             :     }
    3490       98788 : }
    3491             : 
    3492             : /************************************************************************/
    3493             : /*                      GDALCopyWordsComplexOutT()                      */
    3494             : /************************************************************************/
    3495             : /**
    3496             :  * Template function, used to copy data from pSrcData into buffer
    3497             :  * pDstData, with stride nSrcPixelStride in the source data and
    3498             :  * stride nDstPixelStride in the destination data. Deals with the
    3499             :  * case where the value is real coming in, but complex going out.
    3500             :  *
    3501             :  * @param pSrcData the source data buffer
    3502             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    3503             :  *                      of interest, in bytes.
    3504             :  * @param pDstData the destination buffer.
    3505             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    3506             :  *                      interest, in bytes.
    3507             :  * @param nWordCount the total number of pixel words to copy
    3508             :  *
    3509             :  */
    3510             : template <class Tin, class Tout>
    3511        4778 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
    3512             :                                      int nSrcPixelStride,
    3513             :                                      Tout *const CPL_RESTRICT pDstData,
    3514             :                                      int nDstPixelStride, GPtrDiff_t nWordCount)
    3515             : {
    3516        4778 :     decltype(nWordCount) nDstOffset = 0;
    3517             :     // Value written to the second word of every output pixel.
    3518        4778 :     const Tout tOutZero = static_cast<Tout>(0);
    3519             : 
    3520        4778 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    3521        4778 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    3522             : 
    3523     1190456 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    3524             :     {
    3525     1185678 :         const Tin tValue =
    3526     1185678 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    3527     1185678 :         Tout *const pPixelOut =
    3528     1185678 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    3529     1185678 :         GDALCopyWord(tValue, *pPixelOut);
    3530             :         // The input is real: the second output word is simply zeroed.
    3531     1185678 :         pPixelOut[1] = tOutZero;
    3532             : 
    3533     1185678 :         nDstOffset += nDstPixelStride;
    3534             :     }
    3535        4778 : }
    3536             : 
    3537             : /************************************************************************/
    3538             : /*                         GDALCopyWordsFromT()                         */
    3539             : /************************************************************************/
    3540             : /**
    3541             :  * Template driver function. Given the input type T, call the appropriate
    3542             :  * GDALCopyWordsT function template for the desired output type. You should
    3543             :  * never call this function directly (call GDALCopyWords instead).
    3544             :  *
    3545             :  * @param pSrcData source data buffer
    3546             :  * @param nSrcPixelStride pixel stride in input buffer, in bytes
    3547             :  * @param bInComplex input is complex (only read for complex eDstType)
    3548             :  * @param pDstData destination data buffer
    3549             :  * @param eDstType destination data type
    3550             :  * @param nDstPixelStride pixel stride in output buffer, in bytes
    3551             :  * @param nWordCount number of pixel words to be copied
    3552             :  */
    3553             : template <class T>
    3554    61318891 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
    3555             :                                int nSrcPixelStride, bool bInComplex,
    3556             :                                void *CPL_RESTRICT pDstData,
    3557             :                                GDALDataType eDstType, int nDstPixelStride,
    3558             :                                GPtrDiff_t nWordCount)
    3559             : {
    3560    61318891 :     switch (eDstType)
    3561             :     {
    3562     4808915 :         case GDT_UInt8:
    3563     4808915 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3564             :                            static_cast<unsigned char *>(pDstData),
    3565             :                            nDstPixelStride, nWordCount);
    3566     4808915 :             break;
    3567        1903 :         case GDT_Int8:
    3568        1903 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3569             :                            static_cast<signed char *>(pDstData),
    3570             :                            nDstPixelStride, nWordCount);
    3571        1903 :             break;
    3572     1143867 :         case GDT_UInt16:
    3573     1143867 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3574             :                            static_cast<unsigned short *>(pDstData),
    3575             :                            nDstPixelStride, nWordCount);
    3576     1143867 :             break;
    3577     4162744 :         case GDT_Int16:
    3578     4162744 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3579             :                            static_cast<short *>(pDstData), nDstPixelStride,
    3580             :                            nWordCount);
    3581     4162744 :             break;
    3582       23350 :         case GDT_UInt32:
    3583       23350 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3584             :                            static_cast<unsigned int *>(pDstData),
    3585             :                            nDstPixelStride, nWordCount);
    3586       23350 :             break;
    3587    29451281 :         case GDT_Int32:
    3588    29451281 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3589             :                            static_cast<int *>(pDstData), nDstPixelStride,
    3590             :                            nWordCount);
    3591    29451281 :             break;
    3592        1262 :         case GDT_UInt64:
    3593        1262 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3594             :                            static_cast<std::uint64_t *>(pDstData),
    3595             :                            nDstPixelStride, nWordCount);
    3596        1262 :             break;
    3597        6209 :         case GDT_Int64:
    3598        6209 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3599             :                            static_cast<std::int64_t *>(pDstData),
    3600             :                            nDstPixelStride, nWordCount);
    3601        6209 :             break;
    3602         999 :         case GDT_Float16:
    3603         999 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3604             :                            static_cast<GFloat16 *>(pDstData), nDstPixelStride,
    3605             :                            nWordCount);
    3606         999 :             break;
    3607     4216217 :         case GDT_Float32:
    3608     4216217 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3609             :                            static_cast<float *>(pDstData), nDstPixelStride,
    3610             :                            nWordCount);
    3611     4216217 :             break;
    3612    17398515 :         case GDT_Float64:
    3613    17398515 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    3614             :                            static_cast<double *>(pDstData), nDstPixelStride,
    3615             :                            nWordCount);
    3616    17398515 :             break;
    3617       94432 :         case GDT_CInt16:  // complex destinations: pick the helper from bInComplex
    3618       94432 :             if (bInComplex)
    3619             :             {
    3620       93170 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3621             :                                       static_cast<short *>(pDstData),
    3622             :                                       nDstPixelStride, nWordCount);
    3623             :             }
    3624             :             else  // input is not complex, so we need to promote to a complex
    3625             :                   // buffer
    3626             :             {
    3627        1262 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3628             :                                          static_cast<short *>(pDstData),
    3629             :                                          nDstPixelStride, nWordCount);
    3630             :             }
    3631       94432 :             break;
    3632        1357 :         case GDT_CInt32:
    3633        1357 :             if (bInComplex)
    3634             :             {
    3635         717 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3636             :                                       static_cast<int *>(pDstData),
    3637             :                                       nDstPixelStride, nWordCount);
    3638             :             }
    3639             :             else  // input is not complex, so we need to promote to a complex
    3640             :                   // buffer
    3641             :             {
    3642         640 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3643             :                                          static_cast<int *>(pDstData),
    3644             :                                          nDstPixelStride, nWordCount);
    3645             :             }
    3646        1357 :             break;
    3647         313 :         case GDT_CFloat16:
    3648         313 :             if (bInComplex)
    3649             :             {
    3650          48 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3651             :                                       static_cast<GFloat16 *>(pDstData),
    3652             :                                       nDstPixelStride, nWordCount);
    3653             :             }
    3654             :             else  // input is not complex, so we need to promote to a complex
    3655             :                   // buffer
    3656             :             {
    3657         265 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3658             :                                          static_cast<GFloat16 *>(pDstData),
    3659             :                                          nDstPixelStride, nWordCount);
    3660             :             }
    3661         313 :             break;
    3662        3924 :         case GDT_CFloat32:
    3663        3924 :             if (bInComplex)
    3664             :             {
    3665        3115 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3666             :                                       static_cast<float *>(pDstData),
    3667             :                                       nDstPixelStride, nWordCount);
    3668             :             }
    3669             :             else  // input is not complex, so we need to promote to a complex
    3670             :                   // buffer
    3671             :             {
    3672         809 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3673             :                                          static_cast<float *>(pDstData),
    3674             :                                          nDstPixelStride, nWordCount);
    3675             :             }
    3676        3924 :             break;
    3677        3540 :         case GDT_CFloat64:
    3678        3540 :             if (bInComplex)
    3679             :             {
    3680        1738 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    3681             :                                       static_cast<double *>(pDstData),
    3682             :                                       nDstPixelStride, nWordCount);
    3683             :             }
    3684             :             else  // input is not complex, so we need to promote to a complex
    3685             :                   // buffer
    3686             :             {
    3687        1802 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    3688             :                                          static_cast<double *>(pDstData),
    3689             :                                          nDstPixelStride, nWordCount);
    3690             :             }
    3691        3540 :             break;
    3692           0 :         case GDT_Unknown:
    3693             :         case GDT_TypeCount:  // not expected as a destination type
    3694           0 :             CPLAssert(false);
    3695             :     }
    3696    61318891 : }
    3697             : 
    3698             : }  // end anonymous namespace
    3699             : 
    3700             : /************************************************************************/
    3701             : /*                         GDALReplicateWord()                          */
    3702             : /************************************************************************/
    3703             : // Copy the word at pDstData into the nWordCount words that follow it, nDstPixelStride bytes apart.
    3704             : template <class T>
    3705      600542 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
    3706             :                                GPtrDiff_t nWordCount)
    3707             : {
    3708      600542 :     const T valSet = *static_cast<const T *>(pDstData);  // value to replicate
    3709      600542 :     if (nDstPixelStride == static_cast<int>(sizeof(T)))  // packed destination
    3710             :     {
    3711      570728 :         T *pDstPtr = static_cast<T *>(pDstData) + 1;  // start at the second word
    3712    32019219 :         while (nWordCount >= 4)  // manually unrolled by 4
    3713             :         {
    3714    31448490 :             nWordCount -= 4;
    3715    31448490 :             pDstPtr[0] = valSet;
    3716    31448490 :             pDstPtr[1] = valSet;
    3717    31448490 :             pDstPtr[2] = valSet;
    3718    31448490 :             pDstPtr[3] = valSet;
    3719    31448490 :             pDstPtr += 4;
    3720             :         }
    3721     1477071 :         while (nWordCount > 0)  // remaining 0 to 3 words
    3722             :         {
    3723      906343 :             --nWordCount;
    3724      906343 :             *pDstPtr = valSet;
    3725      906343 :             pDstPtr++;
    3726             :         }
    3727             :     }
    3728             :     else
    3729             :     {
    3730       29814 :         GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;  // strided: step in bytes
    3731     1106520 :         while (nWordCount > 0)
    3732             :         {
    3733     1076706 :             --nWordCount;
    3734     1076706 :             *reinterpret_cast<T *>(pabyDstPtr) = valSet;
    3735     1076706 :             pabyDstPtr += nDstPixelStride;
    3736             :         }
    3737             :     }
    3738      600542 : }
    3739             : // Convert one source value to eDstType, then repeat it over nWordCount destination words.
    3740     1080550 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
    3741             :                               GDALDataType eSrcType,
    3742             :                               void *CPL_RESTRICT pDstData,
    3743             :                               GDALDataType eDstType, int nDstPixelStride,
    3744             :                               GPtrDiff_t nWordCount)
    3745             : {
    3746             :     /* -----------------------------------------------------------------------
    3747             :      */
    3748             :     /* Special case when the source data is always the same value */
    3749             :     /* (for VRTSourcedRasterBand::IRasterIO and
    3750             :      * VRTDerivedRasterBand::IRasterIO*/
    3751             :     /*  for example) */
    3752             :     /* -----------------------------------------------------------------------
    3753             :      */
    3754             :     // Let the general translation case do the necessary conversions
    3755             :     // on the first destination element.
    3756     1080550 :     GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
    3757             :     // NOTE(review): one word is written above regardless of nWordCount; presumably callers pass >= 1 - confirm.
    3758             :     // Now copy the first element to the nWordCount - 1 following destination
    3759             :     // elements.
    3760     1080550 :     nWordCount--;
    3761     1080550 :     GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
    3762             :     // Replicate with a loop specialised for the destination type.
    3763     1080550 :     switch (eDstType)
    3764             :     {
    3765      479917 :         case GDT_UInt8:
    3766             :         case GDT_Int8:
    3767             :         {
    3768      479917 :             if (nDstPixelStride == 1)  // contiguous bytes: memset is enough
    3769             :             {
    3770      369983 :                 if (nWordCount > 0)
    3771      369983 :                     memset(pabyDstWord,
    3772      369983 :                            *reinterpret_cast<const GByte *>(pDstData),
    3773             :                            nWordCount);
    3774             :             }
    3775             :             else
    3776             :             {
    3777      109934 :                 GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
    3778    72932400 :                 while (nWordCount > 0)
    3779             :                 {
    3780    72822500 :                     --nWordCount;
    3781    72822500 :                     *pabyDstWord = valSet;
    3782    72822500 :                     pabyDstWord += nDstPixelStride;
    3783             :                 }
    3784             :             }
    3785      479917 :             break;
    3786             :         }
    3787             : // Non-complex multi-byte types delegate to GDALReplicateWordT<c_type>.
    3788             : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type)                               \
    3789             :     case enum_type:                                                            \
    3790             :     {                                                                          \
    3791             :         GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount);     \
    3792             :         break;                                                                 \
    3793             :     }
    3794             : 
    3795       34588 :             CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
    3796      202455 :             CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
    3797          74 :             CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
    3798      301585 :             CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
    3799          41 :             CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
    3800        1072 :             CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
    3801           2 :             CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
    3802       52861 :             CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
    3803        7864 :             CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
    3804             : // Complex types: both words of the first pixel are repeated at every stride.
    3805             : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type)                              \
    3806             :     case enum_type:                                                            \
    3807             :     {                                                                          \
    3808             :         c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0];        \
    3809             :         c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1];        \
    3810             :         while (nWordCount > 0)                                                 \
    3811             :         {                                                                      \
    3812             :             --nWordCount;                                                      \
    3813             :             reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1;              \
    3814             :             reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2;              \
    3815             :             pabyDstWord += nDstPixelStride;                                    \
    3816             :         }                                                                      \
    3817             :         break;                                                                 \
    3818             :     }
    3819             : 
    3820         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
    3821         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
    3822           6 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
    3823         790 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
    3824         790 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
    3825             : 
    3826           0 :         case GDT_Unknown:
    3827             :         case GDT_TypeCount:  // not expected as a destination type
    3828           0 :             CPLAssert(false);
    3829             :     }
    3830     1080550 : }
    3831             : 
    3832             : /************************************************************************/
    3833             : /*                          GDALUnrolledCopy()                          */
    3834             : /************************************************************************/
    3835             : // Copy nIters elements: pDest[i * dstStride] = pSrc[i * srcStride], strides counted in elements of T.
    3836             : template <class T, int srcStride, int dstStride>
    3837             : #if defined(__GNUC__) && defined(__AVX2__)
    3838             : __attribute__((optimize("tree-vectorize")))
    3839             : #endif
    3840     3057302 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
    3841             :                                            const T *CPL_RESTRICT pSrc,
    3842             :                                            GPtrDiff_t nIters)
    3843             : {
    3844             : #if !(defined(__GNUC__) && defined(__AVX2__))
    3845     3057302 :     if (nIters >= 16)  // manual 16-way unrolling, compiled out for GCC with AVX2
    3846             :     {
    3847   133833870 :         for (GPtrDiff_t i = nIters / 16; i != 0; i--)
    3848             :         {
    3849   130897163 :             pDest[0 * dstStride] = pSrc[0 * srcStride];
    3850   130897163 :             pDest[1 * dstStride] = pSrc[1 * srcStride];
    3851   130897163 :             pDest[2 * dstStride] = pSrc[2 * srcStride];
    3852   130897163 :             pDest[3 * dstStride] = pSrc[3 * srcStride];
    3853   130897163 :             pDest[4 * dstStride] = pSrc[4 * srcStride];
    3854   130897163 :             pDest[5 * dstStride] = pSrc[5 * srcStride];
    3855   130897163 :             pDest[6 * dstStride] = pSrc[6 * srcStride];
    3856   130897163 :             pDest[7 * dstStride] = pSrc[7 * srcStride];
    3857   130897163 :             pDest[8 * dstStride] = pSrc[8 * srcStride];
    3858   130897163 :             pDest[9 * dstStride] = pSrc[9 * srcStride];
    3859   130897163 :             pDest[10 * dstStride] = pSrc[10 * srcStride];
    3860   130897163 :             pDest[11 * dstStride] = pSrc[11 * srcStride];
    3861   130897163 :             pDest[12 * dstStride] = pSrc[12 * srcStride];
    3862   130897163 :             pDest[13 * dstStride] = pSrc[13 * srcStride];
    3863   130897163 :             pDest[14 * dstStride] = pSrc[14 * srcStride];
    3864   130897163 :             pDest[15 * dstStride] = pSrc[15 * srcStride];
    3865   130897163 :             pDest += 16 * dstStride;
    3866   130897163 :             pSrc += 16 * srcStride;
    3867             :         }
    3868     2936718 :         nIters = nIters % 16;  // leftovers are handled by the loop below
    3869             :     }
    3870             : #else
    3871             : #pragma GCC unroll 4
    3872             : #endif
    3873     5217606 :     for (GPtrDiff_t i = 0; i < nIters; i++)  // tail, or the whole copy for GCC with AVX2
    3874             :     {
    3875     2160307 :         pDest[i * dstStride] = *pSrc;
    3876     2160307 :         pSrc += srcStride;
    3877             :     }
    3878     3057302 : }
    3879             : // Primary template: forwards unchanged to GDALUnrolledCopyGeneric.
    3880             : template <class T, int srcStride, int dstStride>
    3881     3057302 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
    3882             :                                     const T *CPL_RESTRICT pSrc,
    3883             :                                     GPtrDiff_t nIters)
    3884             : {
    3885     3057302 :     GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
    3886     3057302 : }
    3887             : 
    3888             : #if defined(__AVX2__) && defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                \
    3889             :     (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS))
    3890             : // Specialization (only under the #if above): every third source byte to consecutive destination bytes.
    3891             : template <>
    3892             : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3893             :                                    const GByte *CPL_RESTRICT pSrc,
    3894             :                                    GPtrDiff_t nIters)
    3895             : {
    3896             :     if (nIters > 16)
    3897             :     {
    3898             :         // The SSSE3 variant is slightly faster than what the gcc autovectorizer
    3899             :         // generates
    3900             :         GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    3901             :     }
    3902             :     else
    3903             :     {
    3904             :         for (GPtrDiff_t i = 0; i < nIters; i++)  // 16 elements or fewer: plain scalar loop
    3905             :         {
    3906             :             pDest[i] = *pSrc;
    3907             :             pSrc += 3;
    3908             :         }
    3909             :     }
    3910             : }
    3911             : 
    3912             : #elif defined(HAVE_SSE2) && !(defined(__GNUC__) && defined(__AVX2__))
    3913             : // SSE2 specialization: copies every second byte of pSrc into consecutive bytes of pDest.
    3914             : template <>
    3915      355234 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
    3916             :                                    const GByte *CPL_RESTRICT pSrc,
    3917             :                                    GPtrDiff_t nIters)
    3918             : {
    3919      355234 :     decltype(nIters) i = 0;
    3920      355234 :     if (nIters > 16)
    3921             :     {
    3922      195697 :         const __m128i xmm_mask = _mm_set1_epi16(0xff);  // 0x00ff in every 16-bit lane
    3923             :         // If we were sure that there would always be 1 trailing byte, we could
    3924             :         // check against nIters - 15
    3925     3029070 :         for (; i < nIters - 16; i += 16)
    3926             :         {
    3927             :             __m128i xmm0 =
    3928     2833370 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3929             :             __m128i xmm1 =
    3930     5666750 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3931             :             // Set higher 8bit of each int16 packed word to 0
    3932     2833370 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3933     2833370 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3934             :             // Pack int16 to uint8 and merge back both vector
    3935     2833370 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    3936             : 
    3937             :             // Store result
    3938     2833370 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3939             :             // The 16 bytes just stored came from 32 source bytes.
    3940     2833370 :             pSrc += 2 * 16;
    3941             :         }
    3942             :     }
    3943     4651400 :     for (; i < nIters; i++)  // scalar tail, or the whole copy when nIters <= 16
    3944             :     {
    3945     4296170 :         pDest[i] = *pSrc;
    3946     4296170 :         pSrc += 2;
    3947             :     }
    3948      355234 : }
    3949             : 
    3950           1 : static void GDALUnrolledCopy_GByte_3_1_SSE2(GByte *CPL_RESTRICT pDest,
    3951             :                                             const GByte *CPL_RESTRICT pSrc,
    3952             :                                             GPtrDiff_t nIters)
    3953             : {
    3954           1 :     decltype(nIters) i = 0;
    3955           1 :     const __m128i xmm_mask_ori = _mm_set_epi32(0, 0, 0, 255);
    3956             :     // If we were sure that there would always be 2 trailing bytes, we could
    3957             :     // check against nIters - 15
    3958           2 :     for (; i < nIters - 16; i += 16)
    3959             :     {
    3960             :         __m128i xmm0 =
    3961           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3962             :         __m128i xmm1 =
    3963           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3964             :         __m128i xmm2 =
    3965           1 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    3966             : 
    3967           1 :         auto xmm_mask0 = xmm_mask_ori;
    3968           1 :         auto xmm_mask1 = _mm_slli_si128(xmm_mask_ori, 6);
    3969           1 :         auto xmm_mask2 = _mm_slli_si128(xmm_mask_ori, 11);
    3970             : 
    3971           1 :         auto xmm = _mm_and_si128(xmm0, xmm_mask0);
    3972           1 :         auto xmm_res1 = _mm_and_si128(_mm_slli_si128(xmm1, 4), xmm_mask1);
    3973             : 
    3974           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3975           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3976           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3977           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3978           2 :         xmm_res1 = _mm_or_si128(
    3979             :             xmm_res1, _mm_and_si128(_mm_slli_si128(xmm1, 2), xmm_mask1));
    3980             : 
    3981           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3982           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3983           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3984           2 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3985           1 :         xmm_res1 = _mm_or_si128(xmm_res1, _mm_and_si128(xmm1, xmm_mask1));
    3986             : 
    3987           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3988           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3989           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3990           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3991           2 :         xmm_res1 = _mm_or_si128(
    3992             :             xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 2), xmm_mask1));
    3993             : 
    3994           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    3995           1 :         xmm_mask1 = _mm_slli_si128(xmm_mask1, 1);
    3996           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    3997           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    3998           3 :         xmm_res1 = _mm_or_si128(
    3999             :             xmm_res1, _mm_and_si128(_mm_srli_si128(xmm1, 4), xmm_mask1));
    4000           1 :         xmm = _mm_or_si128(xmm, xmm_res1);
    4001             : 
    4002           1 :         xmm_mask0 = _mm_slli_si128(xmm_mask0, 1);
    4003           1 :         xmm0 = _mm_srli_si128(xmm0, 2);
    4004           1 :         xmm = _mm_or_si128(xmm, _mm_and_si128(xmm0, xmm_mask0));
    4005             : 
    4006           2 :         xmm = _mm_or_si128(xmm,
    4007             :                            _mm_and_si128(_mm_slli_si128(xmm2, 10), xmm_mask2));
    4008             : 
    4009           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    4010           2 :         xmm = _mm_or_si128(xmm,
    4011             :                            _mm_and_si128(_mm_slli_si128(xmm2, 8), xmm_mask2));
    4012             : 
    4013           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    4014           2 :         xmm = _mm_or_si128(xmm,
    4015             :                            _mm_and_si128(_mm_slli_si128(xmm2, 6), xmm_mask2));
    4016             : 
    4017           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    4018           2 :         xmm = _mm_or_si128(xmm,
    4019             :                            _mm_and_si128(_mm_slli_si128(xmm2, 4), xmm_mask2));
    4020             : 
    4021           1 :         xmm_mask2 = _mm_slli_si128(xmm_mask2, 1);
    4022           2 :         xmm = _mm_or_si128(xmm,
    4023             :                            _mm_and_si128(_mm_slli_si128(xmm2, 2), xmm_mask2));
    4024             : 
    4025           1 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm);
    4026             : 
    4027           1 :         pSrc += 3 * 16;
    4028             :     }
    4029           2 :     for (; i < nIters; i++)
    4030             :     {
    4031           1 :         pDest[i] = *pSrc;
    4032           1 :         pSrc += 3;
    4033             :     }
    4034           1 : }
    4035             : 
    4036             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    4037             : // SSSE3-capable build: copies every 3rd byte of pSrc into contiguous pDest.
    4038             : template <>
    4039      193575 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    4040             :                                    const GByte *CPL_RESTRICT pSrc,
    4041             :                                    GPtrDiff_t nIters)
    4042             : {
    4043      193575 :     if (nIters > 16)  // long run: delegate to one of the vector helpers
    4044             :     {
    4045      187452 :         if (CPLHaveRuntimeSSSE3())  // helper is chosen per call from the runtime CPU check
    4046             :         {
    4047      187451 :             GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    4048             :         }
    4049             :         else
    4050             :         {
    4051           1 :             GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
    4052             :         }
    4053             :     }
    4054             :     else
    4055             :     {
    4056       20384 :         for (GPtrDiff_t i = 0; i < nIters; i++)  // 16 words or fewer: plain scalar loop
    4057             :         {
    4058       14261 :             pDest[i] = *pSrc;
    4059       14261 :             pSrc += 3;
    4060             :         }
    4061             :     }
    4062      193575 : }
    4063             : 
    4064             : #else
    4065             : // Build without HAVE_SSSE3_AT_COMPILE_TIME: always forwards to the SSE2 helper.
    4066             : template <>
    4067             : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    4068             :                                    const GByte *CPL_RESTRICT pSrc,
    4069             :                                    GPtrDiff_t nIters)
    4070             : {
    4071             :     GDALUnrolledCopy_GByte_3_1_SSE2(pDest, pSrc, nIters);
    4072             : }
    4073             : #endif
    4074             : // Copies every 4th byte of pSrc into contiguous pDest, 16 output bytes per SSE2 iteration.
    4075             : template <>
    4076      332697 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
    4077             :                                    const GByte *CPL_RESTRICT pSrc,
    4078             :                                    GPtrDiff_t nIters)
    4079             : {
    4080      332697 :     decltype(nIters) i = 0;
    4081      332697 :     if (nIters > 16)
    4082             :     {
    4083      327400 :         const __m128i xmm_mask = _mm_set1_epi32(0xff);  // keeps the lowest byte of each 32-bit lane
    4084             :         // If we were sure that there would always be 3 trailing bytes, we could
    4085             :         // check against nIters - 15
    4086    28190900 :         for (; i < nIters - 16; i += 16)  // always leaves at least one word for the scalar tail
    4087             :         {
    4088             :             __m128i xmm0 =
    4089    27863500 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    4090             :             __m128i xmm1 =
    4091    27863500 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    4092             :             __m128i xmm2 =
    4093    27863500 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    4094             :             __m128i xmm3 =
    4095    55727100 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));  // reads pSrc[48..63]; pSrc[60] is the last byte kept
    4096             :             // Set higher 24bit of each int32 packed word to 0
    4097    27863500 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    4098    27863500 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    4099    27863500 :             xmm2 = _mm_and_si128(xmm2, xmm_mask);
    4100    27863500 :             xmm3 = _mm_and_si128(xmm3, xmm_mask);
    4101             :             // Pack int32 to int16
    4102    27863500 :             xmm0 = _mm_packs_epi32(xmm0, xmm1);  // lanes are <= 0xff after masking, so saturation never clips
    4103    27863500 :             xmm2 = _mm_packs_epi32(xmm2, xmm3);
    4104             :             // Pack int16 to uint8
    4105    27863500 :             xmm0 = _mm_packus_epi16(xmm0, xmm2);
    4106             : 
    4107             :             // Store result
    4108    27863500 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    4109             : 
    4110    27863500 :             pSrc += 4 * 16;
    4111             :         }
    4112             :     }
    4113     5049360 :     for (; i < nIters; i++)  // scalar tail; the whole copy when nIters <= 16
    4114             :     {
    4115     4716660 :         pDest[i] = *pSrc;
    4116     4716660 :         pSrc += 4;
    4117             :     }
    4118      332697 : }
    4119             : #endif  // HAVE_SSE2
    4120             : 
    4121             : /************************************************************************/
    4122             : /*                            GDALFastCopy()                            */
    4123             : /************************************************************************/
    4124             : // Copies nIters words of type T between strided buffers; both strides are expressed in bytes.
    4125             : template <class T>
    4126    40277700 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
    4127             :                                 const T *CPL_RESTRICT pSrc, int nSrcStride,
    4128             :                                 GPtrDiff_t nIters)
    4129             : {
    4130    40277700 :     constexpr int sizeofT = static_cast<int>(sizeof(T));
    4131    40277700 :     if (nIters == 1)  // single word: strides do not matter
    4132             :     {
    4133    22545840 :         *pDest = *pSrc;
    4134             :     }
    4135    17731883 :     else if (nDestStride == sizeofT)  // packed destination
    4136             :     {
    4137    14601400 :         if (nSrcStride == sizeofT)  // packed source as well: one bulk copy
    4138             :         {
    4139    13510184 :             memcpy(pDest, pSrc, nIters * sizeof(T));
    4140             :         }
    4141     1091240 :         else if (nSrcStride == 2 * sizeofT)
    4142             :         {
    4143      358450 :             GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
    4144             :         }
    4145      732790 :         else if (nSrcStride == 3 * sizeofT)
    4146             :         {
    4147      290555 :             GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
    4148             :         }
    4149      442235 :         else if (nSrcStride == 4 * sizeofT)
    4150             :         {
    4151      336679 :             GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
    4152             :         }
    4153             :         else
    4154             :         {
    4155    17229290 :             while (nIters-- > 0)  // any other source stride
    4156             :             {
    4157    17123750 :                 *pDest = *pSrc;
    4158    17123750 :                 pSrc += nSrcStride / sizeofT;  // bytes to elements; exact only for strides that are multiples of sizeof(T)
    4159    17123750 :                 pDest++;
    4160             :             }
    4161             :         }
    4162             :     }
    4163     3130433 :     else if (nSrcStride == sizeofT)  // packed source, strided destination
    4164             :     {
    4165     3117437 :         if (nDestStride == 2 * sizeofT)
    4166             :         {
    4167      152834 :             GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
    4168             :         }
    4169     2964605 :         else if (nDestStride == 3 * sizeofT)
    4170             :         {
    4171     2136181 :             GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
    4172             :         }
    4173      828421 :         else if (nDestStride == 4 * sizeofT)
    4174             :         {
    4175      664109 :             GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
    4176             :         }
    4177             :         else
    4178             :         {
    4179    17169660 :             while (nIters-- > 0)  // any other destination stride
    4180             :             {
    4181    17005410 :                 *pDest = *pSrc;
    4182    17005410 :                 pSrc++;
    4183    17005410 :                 pDest += nDestStride / sizeofT;
    4184             :             }
    4185             :         }
    4186             :     }
    4187             :     else
    4188             :     {
    4189     1220108 :         while (nIters-- > 0)  // neither side packed: step both pointers by their own stride
    4190             :         {
    4191     1207102 :             *pDest = *pSrc;
    4192     1207102 :             pSrc += nSrcStride / sizeofT;
    4193     1207102 :             pDest += nDestStride / sizeofT;
    4194             :         }
    4195             :     }
    4196    40277700 : }
    4197             : 
    4198             : /************************************************************************/
    4199             : /*                          GDALFastCopyByte()                          */
    4200             : /************************************************************************/
    4201             : // GByte wrapper: takes (source, destination) and passes them to GDALFastCopy in its (destination, source) order.
    4202      326330 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
    4203             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
    4204             :                              int nDstPixelStride, GPtrDiff_t nWordCount)
    4205             : {
    4206      326330 :     GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
    4207             :                  nWordCount);
    4208      326330 : }
    4209             : 
    4210             : /************************************************************************/
    4211             : /*                           GDALCopyWords()                            */
    4212             : /************************************************************************/
    4213             : 
    4214             : /**
    4215             :  * Copy pixel words from buffer to buffer.
    4216             :  *
    4217             :  * Variant taking an int word count: every argument is forwarded unchanged
    4217             :  * to GDALCopyWords64(), whose word count parameter is a GPtrDiff_t.
    4217             :  *
    4217             :  * @see GDALCopyWords64()
    4218             :  */
    4219    80550500 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
    4220             :                                GDALDataType eSrcType, int nSrcPixelStride,
    4221             :                                void *CPL_RESTRICT pDstData,
    4222             :                                GDALDataType eDstType, int nDstPixelStride,
    4223             :                                int nWordCount)
    4224             : {
    4225    80550500 :     GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
    4226             :                     nDstPixelStride, nWordCount);
    4227    80550500 : }
    4228             : 
    4229             : /************************************************************************/
    4230             : /*                          GDALCopyWords64()                           */
    4231             : /************************************************************************/
    4232             : 
    4233             : /**
    4234             :  * Copy pixel words from buffer to buffer.
    4235             :  *
    4236             :  * This function is used to copy pixel word values from one memory buffer
    4237             :  * to another, with support for conversion between data types, and differing
    4238             :  * step factors. The data type conversion is done using the following
    4239             :  * rules:
    4240             :  * <ul>
    4241             :  * <li>Values assigned to a lower range integer type are clipped. For
    4242             :  * instance assigning GDT_Int16 values to a GDT_UInt8 buffer will cause values
    4243             :  * less than 0 to be set to 0, and values larger than 255 to be set to 255.
    4244             :  * </li>
    4245             :  * <li>
    4246             :  * Assignment from floating point to integer rounds to closest integer.
    4247             :  * +Infinity is mapped to the largest integer. -Infinity is mapped to the
    4248             :  * smallest integer. NaN is mapped to 0.
    4249             :  * </li>
    4250             :  * <li>
    4251             :  * Assignment from non-complex to complex will result in the imaginary part
    4252             :  * being set to zero on output.
    4253             :  * </li>
    4254             :  * <li> Assignment from complex to
    4255             :  * non-complex will result in the complex portion being lost and the real
    4256             :  * component being preserved (<i>not magnitude!</i>).
    4257             :  * </li>
    4258             :  * </ul>
    4259             :  *
    4260             :  * No assumptions are made about the source or destination words occurring
    4261             :  * on word boundaries.  It is assumed that all values are in native machine
    4262             :  * byte order.
    4263             :  *
    4264             :  * @param pSrcData Pointer to source data to be converted.
    4265             :  * @param eSrcType the source data type (see GDALDataType enum)
    4266             :  * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
    4267             :  * in bytes
    4268             :  * @param pDstData Pointer to buffer where destination data should go
    4269             :  * @param eDstType the destination data type (see GDALDataType enum)
    4270             :  * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
    4271             :  * words), in bytes
    4272             :  * @param nWordCount number of words to be copied
    4273             :  *
    4274             :  * @note
    4275             :  * When adding a new data type to GDAL, you must do the following to
    4276             :  * support it properly within the GDALCopyWords function:
    4277             :  * 1. Add the data type to the switch on eSrcType in GDALCopyWords64.
    4278             :  *    This should invoke the appropriate GDALCopyWordsFromT wrapper.
    4279             :  * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
    4280             :  *    This should call the appropriate GDALCopyWordsT template.
    4281             :  * 3. If appropriate, overload the appropriate CopyWord template in the
    4282             :  *    above namespace. This will ensure that any conversion issues are
    4283             :  *    handled (cases like the float -> int32 case, where the min/max
    4284             :  *    values are subject to roundoff error).
    4285             :  */
    4286             : 
    4287   116957000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
    4288             :                                  GDALDataType eSrcType, int nSrcPixelStride,
    4289             :                                  void *CPL_RESTRICT pDstData,
    4290             :                                  GDALDataType eDstType, int nDstPixelStride,
    4291             :                                  GPtrDiff_t nWordCount)
    4292             : // Check order: zero-size types, misaligned input, word replication, same-type fast paths, generic conversion.
    4293             : {
    4294             :     // On platforms where alignment matters, be careful
    4295   116957000 :     const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
    4296   116957000 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
    4297   116957000 :     if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))  // also protects the modulo operations below
    4298             :     {
    4299           2 :         CPLError(CE_Failure, CPLE_NotSupported,
    4300             :                  "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
    4301             :                  "argument");
    4302           2 :         return;
    4303             :     }
    4304   116957000 :     if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&  // alignment is not checked when type and stride both match
    4305    66385300 :         ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
    4306    66385300 :          (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
    4307    66384900 :          (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
    4308    66384800 :          (nDstPixelStride % nDstDataTypeSize) != 0))
    4309             :     {
    4310         905 :         if (eSrcType == eDstType)  // same type: raw byte copy of each word, no conversion
    4311             :         {
    4312       34800 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    4313             :             {
    4314       34000 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    4315             :                        static_cast<const GByte *>(pSrcData) +
    4316       34000 :                            nSrcPixelStride * i,
    4317             :                        nDstDataTypeSize);
    4318             :             }
    4319             :         }
    4320             :         else  // different types: stage each word in aligned scratch buffers and convert it there
    4321             :         {
    4322         210 :             const auto getAlignedPtr = [](GByte *ptr, int align)  // first address >= ptr that is a multiple of align
    4323             :             {
    4324             :                 return ptr +
    4325         210 :                        ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
    4326         210 :                         align);
    4327             :             };
    4328             : 
    4329             :             // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
    4330             :             // be sure to get correctly aligned pointer.
    4331         105 :             constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
    4332             :             GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
    4333             :             GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
    4334             :             GByte *pabySrcBuffer =
    4335         105 :                 getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
    4336             :             GByte *pabyDstBuffer =
    4337         105 :                 getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
    4338        3360 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    4339             :             {
    4340        3255 :                 memcpy(pabySrcBuffer,
    4341             :                        static_cast<const GByte *>(pSrcData) +
    4342        3255 :                            nSrcPixelStride * i,
    4343             :                        nSrcDataTypeSize);
    4344        3255 :                 GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,  // recursive call on one aligned word with zero strides
    4345             :                                 eDstType, 0, 1);
    4346        3255 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    4347             :                        pabyDstBuffer, nDstDataTypeSize);
    4348             :             }
    4349             :         }
    4350         905 :         return;
    4351             :     }
    4352             : 
    4353             :     // Deal with the case where we're replicating a single word into the
    4354             :     // provided buffer
    4355   116956000 :     if (nSrcPixelStride == 0 && nWordCount > 1)
    4356             :     {
    4357     1080550 :         GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
    4358             :                           nDstPixelStride, nWordCount);
    4359     1080550 :         return;
    4360             :     }
    4361             : 
    4362   115875000 :     if (eSrcType == eDstType)  // no conversion needed: try the copy-only fast paths first
    4363             :     {
    4364    54818200 :         if (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8)  // both handled as GByte
    4365             :         {
    4366    18155000 :             GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
    4367             :                          static_cast<const GByte *>(pSrcData), nSrcPixelStride,
    4368             :                          nWordCount);
    4369    18155000 :             return;
    4370             :         }
    4371             : 
    4372    36663300 :         if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&  // every 2-byte type is copied through the short instantiation
    4373    21796400 :             (nDstPixelStride % 2) == 0)
    4374             :         {
    4375    21796400 :             GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
    4376             :                          static_cast<const short *>(pSrcData), nSrcPixelStride,
    4377             :                          nWordCount);
    4378    21796400 :             return;
    4379             :         }
    4380             : 
    4381    14866800 :         if (nWordCount == 1)  // single word: memcpy whose constant length is selected by the type size
    4382             :         {
    4383             : #if defined(CSA_BUILD) || defined(__COVERITY__)
    4384             :             // Avoid false positives...
    4385             :             memcpy(pDstData, pSrcData, nSrcDataTypeSize);
    4386             : #else
    4387    14377000 :             if (nSrcDataTypeSize == 2)
    4388           0 :                 memcpy(pDstData, pSrcData, 2);
    4389    14377000 :             else if (nSrcDataTypeSize == 4)
    4390    13807600 :                 memcpy(pDstData, pSrcData, 4);
    4391      569334 :             else if (nSrcDataTypeSize == 8)
    4392      552729 :                 memcpy(pDstData, pSrcData, 8);
    4393             :             else /* if( eSrcType == GDT_CFloat64 ) */
    4394       16605 :                 memcpy(pDstData, pSrcData, 16);
    4395             : #endif
    4396    14377000 :             return;
    4397             :         }
    4398             : 
    4399             :         // Let memcpy() handle the case where we're copying a packed buffer
    4400             :         // of pixels.
    4401      489869 :         if (nSrcPixelStride == nDstPixelStride)
    4402             :         {
    4403      228025 :             if (nSrcPixelStride == nSrcDataTypeSize)  // equal but non-packed strides fall through to the switch below
    4404             :             {
    4405      227945 :                 memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
    4406      227945 :                 return;
    4407             :             }
    4408             :         }
    4409             :     }
    4410             : 
    4411             :     // Handle the more general case -- deals with conversion of data types
    4412             :     // directly.
    4413    61318900 :     switch (eSrcType)  // the bool passed after the stride is true only in the GDT_C* cases
    4414             :     {
    4415    20316700 :         case GDT_UInt8:
    4416    20316700 :             GDALCopyWordsFromT<unsigned char>(
    4417             :                 static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
    4418             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4419    20316700 :             break;
    4420        1806 :         case GDT_Int8:
    4421        1806 :             GDALCopyWordsFromT<signed char>(
    4422             :                 static_cast<const signed char *>(pSrcData), nSrcPixelStride,
    4423             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4424        1806 :             break;
    4425       55565 :         case GDT_UInt16:
    4426       55565 :             GDALCopyWordsFromT<unsigned short>(
    4427             :                 static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
    4428             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4429       55565 :             break;
    4430     6519870 :         case GDT_Int16:
    4431     6519870 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    4432             :                                       nSrcPixelStride, false, pDstData,
    4433             :                                       eDstType, nDstPixelStride, nWordCount);
    4434     6519870 :             break;
    4435        8282 :         case GDT_UInt32:
    4436        8282 :             GDALCopyWordsFromT<unsigned int>(
    4437             :                 static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
    4438             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4439        8282 :             break;
    4440    12254800 :         case GDT_Int32:
    4441    12254800 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    4442             :                                     nSrcPixelStride, false, pDstData, eDstType,
    4443             :                                     nDstPixelStride, nWordCount);
    4444    12254800 :             break;
    4445        2205 :         case GDT_UInt64:
    4446        2205 :             GDALCopyWordsFromT<std::uint64_t>(
    4447             :                 static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
    4448             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4449        2205 :             break;
    4450       11739 :         case GDT_Int64:
    4451       11739 :             GDALCopyWordsFromT<std::int64_t>(
    4452             :                 static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
    4453             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    4454       11739 :             break;
    4455        1387 :         case GDT_Float16:
    4456        1387 :             GDALCopyWordsFromT<GFloat16>(
    4457             :                 static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
    4458             :                 pDstData, eDstType, nDstPixelStride, nWordCount);
    4459        1387 :             break;
    4460      669102 :         case GDT_Float32:
    4461      669102 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    4462             :                                       nSrcPixelStride, false, pDstData,
    4463             :                                       eDstType, nDstPixelStride, nWordCount);
    4464      669102 :             break;
    4465    20716400 :         case GDT_Float64:
    4466    20716400 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    4467             :                                        nSrcPixelStride, false, pDstData,
    4468             :                                        eDstType, nDstPixelStride, nWordCount);
    4469    20716400 :             break;
    4470      478486 :         case GDT_CInt16:
    4471      478486 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    4472             :                                       nSrcPixelStride, true, pDstData, eDstType,
    4473             :                                       nDstPixelStride, nWordCount);
    4474      478486 :             break;
    4475         868 :         case GDT_CInt32:
    4476         868 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    4477             :                                     nSrcPixelStride, true, pDstData, eDstType,
    4478             :                                     nDstPixelStride, nWordCount);
    4479         868 :             break;
    4480         508 :         case GDT_CFloat16:
    4481         508 :             GDALCopyWordsFromT<GFloat16>(
    4482             :                 static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
    4483             :                 pDstData, eDstType, nDstPixelStride, nWordCount);
    4484         508 :             break;
    4485        2437 :         case GDT_CFloat32:
    4486        2437 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    4487             :                                       nSrcPixelStride, true, pDstData, eDstType,
    4488             :                                       nDstPixelStride, nWordCount);
    4489        2437 :             break;
    4490      278699 :         case GDT_CFloat64:
    4491      278699 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    4492             :                                        nSrcPixelStride, true, pDstData,
    4493             :                                        eDstType, nDstPixelStride, nWordCount);
    4494      278699 :             break;
    4495           0 :         case GDT_Unknown:  // NOTE(review): presumably already rejected by the zero-size check at the top - confirm
    4496             :         case GDT_TypeCount:
    4497           0 :             CPLAssert(false);
    4498             :     }
    4499             : }
    4500             : 
    4501             : /************************************************************************/
    4502             : /*                            GDALCopyBits()                            */
    4503             : /************************************************************************/
    4504             : 
    4505             : /**
    4506             :  * Bitwise word copying.
    4507             :  *
    4508             :  * A function for moving sets of partial bytes around.  Loosely
    4509             :  * speaking this is a bitwise analog to GDALCopyWords().
    4510             :  *
    4511             :  * It copies nStepCount "words" where each word is nBitCount bits long.
    4512             :  * The nSrcStep and nDstStep are the number of bits from the start of one
    4513             :  * word to the next (same as nBitCount if they are packed).  The nSrcOffset
    4514             :  * and nDstOffset are the offset into the source and destination buffers
    4515             :  * to start at, also measured in bits.
    4516             :  *
    4517             :  * All bit offsets are assumed to start from the high order bit in a byte
    4518             :  * (i.e. most significant bit first).  Currently this function is not very
    4519             :  * optimized, but it may be improved for some common cases in the future
    4520             :  * as needed.
    4521             :  *
    4522             :  * @param pabySrcData the source data buffer.
    4523             :  * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
    4524             :  * first word to copy.
    4525             :  * @param nSrcStep the offset in bits from the start one source word to the
    4526             :  * start of the next.
    4527             :  * @param pabyDstData the destination data buffer.
    4528             :  * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
    4529             :  * first word to copy over.
    4530             :  * @param nDstStep the offset in bits from the start one word to the
    4531             :  * start of the next.
    4532             :  * @param nBitCount the number of bits in a word to be copied.
    4533             :  * @param nStepCount the number of words to copy.
    4534             :  */
    4535             : 
    4536           0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
    4537             :                   GByte *pabyDstData, int nDstOffset, int nDstStep,
    4538             :                   int nBitCount, int nStepCount)
    4539             : 
    4540             : {
    4541           0 :     VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
    4542             : 
    4543           0 :     for (int iStep = 0; iStep < nStepCount; iStep++)
    4544             :     {
    4545           0 :         for (int iBit = 0; iBit < nBitCount; iBit++)
    4546             :         {
    4547           0 :             if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
    4548           0 :                 pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
    4549             :             else
    4550           0 :                 pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
    4551             : 
    4552           0 :             nSrcOffset++;
    4553           0 :             nDstOffset++;
    4554             :         }
    4555             : 
    4556           0 :         nSrcOffset += (nSrcStep - nBitCount);
    4557           0 :         nDstOffset += (nDstStep - nBitCount);
    4558             :     }
    4559             : }
    4560             : 
    4561             : /************************************************************************/
    4562             : /*                    GDALBandGetBestOverviewLevel()                    */
    4563             : /************************************************************************/
    4564             : 
    4565      525465 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand,
    4566             :                                  double dfTargetDownsamplingRatio,
    4567             :                                  double dfOversamplingThreshold)
    4568             : {
    4569      525465 :     int iBestOvr = -1;
    4570      525465 :     double dfBestRatio = 0;
    4571      525465 :     const int nOvCount = poBand->GetOverviewCount();
    4572      525465 :     constexpr double EPSILON = 1e-1;
    4573     1053620 :     for (int iOvr = -1; iOvr < nOvCount; iOvr++)
    4574             :     {
    4575      531092 :         double dfOvrRatio = 1.0;
    4576      531092 :         GDALRasterBand *poOvrBand = nullptr;
    4577      531092 :         if (iOvr >= 0)
    4578             :         {
    4579        5627 :             poOvrBand = poBand->GetOverview(iOvr);
    4580       11254 :             if (poOvrBand == nullptr ||
    4581       11253 :                 poOvrBand->GetXSize() > poBand->GetXSize() ||
    4582        5626 :                 poOvrBand->GetYSize() > poBand->GetYSize())
    4583             :             {
    4584           1 :                 continue;
    4585             :             }
    4586       22504 :             dfOvrRatio = std::min(static_cast<double>(poBand->GetXSize()) /
    4587        5626 :                                       poOvrBand->GetXSize(),
    4588       11252 :                                   static_cast<double>(poBand->GetYSize()) /
    4589       11252 :                                       poOvrBand->GetYSize());
    4590             :         }
    4591             : 
    4592             :         // Is it nearly the requested factor and better (lower) than
    4593             :         // the current best factor?
    4594             :         // Use an epsilon because of numerical instability.
    4595      531197 :         if (dfOvrRatio >=
    4596      531091 :                 dfTargetDownsamplingRatio * dfOversamplingThreshold + EPSILON ||
    4597             :             dfOvrRatio <= dfBestRatio)
    4598             :         {
    4599         106 :             continue;
    4600             :         }
    4601             : 
    4602      530985 :         if (poOvrBand)
    4603             :         {
    4604             :             // Ignore AVERAGE_BIT2GRAYSCALE overviews.
    4605             :             const char *pszResampling =
    4606        5520 :                 poOvrBand->GetMetadataItem("RESAMPLING");
    4607        5520 :             if (pszResampling != nullptr &&
    4608          71 :                 STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
    4609             :             {
    4610          16 :                 continue;
    4611             :             }
    4612             :         }
    4613             : 
    4614      530969 :         iBestOvr = iOvr;
    4615      530969 :         dfBestRatio = dfOvrRatio;
    4616      530969 :         if (std::abs(dfTargetDownsamplingRatio - dfOvrRatio) < EPSILON)
    4617             :         {
    4618        2938 :             break;
    4619             :         }
    4620             :     }
    4621      525465 :     return iBestOvr;
    4622             : }
    4623             : 
    4624             : /************************************************************************/
    4625             : /*                    GDALBandGetBestOverviewLevel()                    */
    4626             : /*                                                                      */
    4627             : /* Returns the best overview level to satisfy the query or -1 if none   */
    4628             : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when        */
    4629             : /* returning a valid overview level                                     */
    4630             : /************************************************************************/
    4631             : 
    4632           0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
    4633             :                                  int &nXSize, int &nYSize, int nBufXSize,
    4634             :                                  int nBufYSize)
    4635             : {
    4636           0 :     return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
    4637           0 :                                          nBufXSize, nBufYSize, nullptr);
    4638             : }
    4639             : 
    4640      525558 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
    4641             :                                   int &nYOff, int &nXSize, int &nYSize,
    4642             :                                   int nBufXSize, int nBufYSize,
    4643             :                                   GDALRasterIOExtraArg *psExtraArg)
    4644             : {
    4645      525558 :     if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
    4646      525558 :         psExtraArg->bUseOnlyThisScale)
    4647         109 :         return -1;
    4648             :     /* -------------------------------------------------------------------- */
    4649             :     /*      Compute the desired downsampling factor.  It is                 */
    4650             :     /*      based on the least reduced axis, and represents the number      */
    4651             :     /*      of source pixels to one destination pixel.                      */
    4652             :     /* -------------------------------------------------------------------- */
    4653      525449 :     const double dfDesiredDownsamplingFactor =
    4654      525449 :         ((nXSize / static_cast<double>(nBufXSize)) <
    4655      363109 :              (nYSize / static_cast<double>(nBufYSize)) ||
    4656             :          nBufYSize == 1)
    4657      755374 :             ? nXSize / static_cast<double>(nBufXSize)
    4658      133184 :             : nYSize / static_cast<double>(nBufYSize);
    4659             : 
    4660             :     /* -------------------------------------------------------------------- */
    4661             :     /*      Find the overview level that largest downsampling factor (most  */
    4662             :     /*      downsampled) that is still less than (or only a little more)    */
    4663             :     /*      downsampled than the request.                                   */
    4664             :     /* -------------------------------------------------------------------- */
    4665             : 
    4666             :     const char *pszOversampligThreshold =
    4667      525449 :         CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
    4668             : 
    4669             :     // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
    4670             :     const double dfOversamplingThreshold =
    4671     1050890 :         pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
    4672      525440 :         : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
    4673     1050880 :             ? 1.0
    4674      525449 :             : 1.2;
    4675      525449 :     const int iBestOvrLevel = GDALBandGetBestOverviewLevel(
    4676             :         poBand, dfDesiredDownsamplingFactor, dfOversamplingThreshold);
    4677             : 
    4678             :     /* -------------------------------------------------------------------- */
    4679             :     /*      If we didn't find an overview that helps us, just return        */
    4680             :     /*      indicating failure and the full resolution image will be used.  */
    4681             :     /* -------------------------------------------------------------------- */
    4682      525449 :     if (iBestOvrLevel < 0)
    4683      522456 :         return -1;
    4684        2993 :     const GDALRasterBand *poBestOverview = poBand->GetOverview(iBestOvrLevel);
    4685             : 
    4686             :     /* -------------------------------------------------------------------- */
    4687             :     /*      Recompute the source window in terms of the selected            */
    4688             :     /*      overview.                                                       */
    4689             :     /* -------------------------------------------------------------------- */
    4690             :     const double dfXFactor =
    4691        2993 :         poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
    4692             :     const double dfYFactor =
    4693        2993 :         poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
    4694        2993 :     CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
    4695             :              poBestOverview->GetYSize());
    4696             : 
    4697        8979 :     const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
    4698        2993 :                                 static_cast<int>(nXOff / dfXFactor + 0.5));
    4699        8979 :     const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
    4700        2993 :                                 static_cast<int>(nYOff / dfYFactor + 0.5));
    4701        2993 :     int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
    4702        2993 :     int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
    4703        2993 :     if (nOXOff + nOXSize > poBestOverview->GetXSize())
    4704           0 :         nOXSize = poBestOverview->GetXSize() - nOXOff;
    4705        2993 :     if (nOYOff + nOYSize > poBestOverview->GetYSize())
    4706           2 :         nOYSize = poBestOverview->GetYSize() - nOYOff;
    4707             : 
    4708        2993 :     if (psExtraArg)
    4709             :     {
    4710        2993 :         if (psExtraArg->bFloatingPointWindowValidity)
    4711             :         {
    4712         117 :             psExtraArg->dfXOff /= dfXFactor;
    4713         117 :             psExtraArg->dfXSize /= dfXFactor;
    4714         117 :             psExtraArg->dfYOff /= dfYFactor;
    4715         117 :             psExtraArg->dfYSize /= dfYFactor;
    4716             :         }
    4717        2876 :         else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
    4718             :         {
    4719          16 :             psExtraArg->bFloatingPointWindowValidity = true;
    4720          16 :             psExtraArg->dfXOff = nXOff / dfXFactor;
    4721          16 :             psExtraArg->dfXSize = nXSize / dfXFactor;
    4722          16 :             psExtraArg->dfYOff = nYOff / dfYFactor;
    4723          16 :             psExtraArg->dfYSize = nYSize / dfYFactor;
    4724             :         }
    4725             :     }
    4726             : 
    4727        2993 :     nXOff = nOXOff;
    4728        2993 :     nYOff = nOYOff;
    4729        2993 :     nXSize = nOXSize;
    4730        2993 :     nYSize = nOYSize;
    4731             : 
    4732        2993 :     return iBestOvrLevel;
    4733             : }
    4734             : 
    4735             : /************************************************************************/
    4736             : /*                          OverviewRasterIO()                          */
    4737             : /*                                                                      */
    4738             : /*      Special work function to utilize available overviews to         */
    4739             : /*      more efficiently satisfy downsampled requests.  It will         */
    4740             : /*      return CE_Failure if there are no appropriate overviews         */
    4741             : /*      available but it doesn't emit any error messages.               */
    4742             : /************************************************************************/
    4743             : 
    4744             : //! @cond Doxygen_Suppress
    4745           1 : CPLErr GDALRasterBand::OverviewRasterIO(
    4746             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4747             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4748             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    4749             : 
    4750             : {
    4751             :     GDALRasterIOExtraArg sExtraArg;
    4752           1 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4753             : 
    4754           1 :     const int nOverview = GDALBandGetBestOverviewLevel2(
    4755             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
    4756           1 :     if (nOverview < 0)
    4757           1 :         return CE_Failure;
    4758             : 
    4759             :     /* -------------------------------------------------------------------- */
    4760             :     /*      Recast the call in terms of the new raster layer.               */
    4761             :     /* -------------------------------------------------------------------- */
    4762           0 :     GDALRasterBand *poOverviewBand = GetOverview(nOverview);
    4763           0 :     if (poOverviewBand == nullptr)
    4764           0 :         return CE_Failure;
    4765             : 
    4766           0 :     return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4767             :                                     pData, nBufXSize, nBufYSize, eBufType,
    4768           0 :                                     nPixelSpace, nLineSpace, &sExtraArg);
    4769             : }
    4770             : 
    4771             : /************************************************************************/
    4772             : /*                        TryOverviewRasterIO()                         */
    4773             : /************************************************************************/
    4774             : 
    4775      362429 : CPLErr GDALRasterBand::TryOverviewRasterIO(
    4776             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4777             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4778             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
    4779             :     int *pbTried)
    4780             : {
    4781      362429 :     int nXOffMod = nXOff;
    4782      362429 :     int nYOffMod = nYOff;
    4783      362429 :     int nXSizeMod = nXSize;
    4784      362429 :     int nYSizeMod = nYSize;
    4785             :     GDALRasterIOExtraArg sExtraArg;
    4786             : 
    4787      362429 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4788             : 
    4789      362429 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    4790             :         this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
    4791             :         &sExtraArg);
    4792             : 
    4793      362429 :     if (iOvrLevel >= 0)
    4794             :     {
    4795          53 :         GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
    4796          53 :         if (poOverviewBand)
    4797             :         {
    4798          53 :             *pbTried = TRUE;
    4799          53 :             return poOverviewBand->RasterIO(
    4800             :                 eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
    4801             :                 nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    4802          53 :                 &sExtraArg);
    4803             :         }
    4804             :     }
    4805             : 
    4806      362376 :     *pbTried = FALSE;
    4807      362376 :     return CE_None;
    4808             : }
    4809             : 
    4810             : /************************************************************************/
    4811             : /*                        TryOverviewRasterIO()                         */
    4812             : /************************************************************************/
    4813             : 
    4814      160154 : CPLErr GDALDataset::TryOverviewRasterIO(
    4815             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4816             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4817             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    4818             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
    4819             :     int *pbTried)
    4820             : {
    4821      160154 :     int nXOffMod = nXOff;
    4822      160154 :     int nYOffMod = nYOff;
    4823      160154 :     int nXSizeMod = nXSize;
    4824      160154 :     int nYSizeMod = nYSize;
    4825             :     GDALRasterIOExtraArg sExtraArg;
    4826      160154 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4827             : 
    4828      320308 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    4829      160154 :         papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
    4830             :         nBufYSize, &sExtraArg);
    4831             : 
    4832      160197 :     if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
    4833          43 :         papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
    4834             :     {
    4835          43 :         *pbTried = TRUE;
    4836          43 :         return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
    4837             :             eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
    4838             :             nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
    4839          43 :             nLineSpace, nBandSpace, &sExtraArg);
    4840             :     }
    4841             :     else
    4842             :     {
    4843      160111 :         *pbTried = FALSE;
    4844      160111 :         return CE_None;
    4845             :     }
    4846             : }
    4847             : 
    4848             : /************************************************************************/
    4849             : /*                   GDALDatasetGetBestOverviewLevel()                  */
    4850             : /*                                                                      */
    4851             : /* Returns the best overview level to satisfy the query or -1 if none   */
    4852             : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid     */
    4853             : /* overview level                                                       */
    4854             : /************************************************************************/
    4855             : 
    4856           4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
    4857             :                                            int &nYOff, int &nXSize, int &nYSize,
    4858             :                                            int nBufXSize, int nBufYSize,
    4859             :                                            int nBandCount,
    4860             :                                            const int *panBandMap,
    4861             :                                            GDALRasterIOExtraArg *psExtraArg)
    4862             : {
    4863           4 :     int nOverviewCount = 0;
    4864           4 :     GDALRasterBand *poFirstBand = nullptr;
    4865             : 
    4866             :     /* -------------------------------------------------------------------- */
    4867             :     /* Check that all bands have the same number of overviews and           */
    4868             :     /* that they have all the same size and block dimensions                */
    4869             :     /* -------------------------------------------------------------------- */
    4870          12 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4871             :     {
    4872           8 :         GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
    4873           8 :         if (poBand == nullptr)
    4874           0 :             return -1;
    4875           8 :         if (iBand == 0)
    4876             :         {
    4877           4 :             poFirstBand = poBand;
    4878           4 :             nOverviewCount = poBand->GetOverviewCount();
    4879             :         }
    4880           4 :         else if (nOverviewCount != poBand->GetOverviewCount())
    4881             :         {
    4882           0 :             CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    4883             :                              "mismatched overview count, use std method.");
    4884           0 :             return -1;
    4885             :         }
    4886             :         else
    4887             :         {
    4888           4 :             for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    4889             :             {
    4890           0 :                 GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
    4891             :                 GDALRasterBand *poOvrFirstBand =
    4892           0 :                     poFirstBand->GetOverview(iOverview);
    4893           0 :                 if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
    4894           0 :                     continue;
    4895             : 
    4896           0 :                 if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
    4897           0 :                     poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
    4898             :                 {
    4899           0 :                     CPLDebug("GDAL",
    4900             :                              "GDALDataset::GetBestOverviewLevel() ... "
    4901             :                              "mismatched overview sizes, use std method.");
    4902           0 :                     return -1;
    4903             :                 }
    4904           0 :                 int nBlockXSizeFirst = 0;
    4905           0 :                 int nBlockYSizeFirst = 0;
    4906           0 :                 poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
    4907             :                                              &nBlockYSizeFirst);
    4908             : 
    4909           0 :                 int nBlockXSizeCurrent = 0;
    4910           0 :                 int nBlockYSizeCurrent = 0;
    4911           0 :                 poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
    4912             :                                         &nBlockYSizeCurrent);
    4913             : 
    4914           0 :                 if (nBlockXSizeFirst != nBlockXSizeCurrent ||
    4915           0 :                     nBlockYSizeFirst != nBlockYSizeCurrent)
    4916             :                 {
    4917           0 :                     CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    4918             :                                      "mismatched block sizes, use std method.");
    4919           0 :                     return -1;
    4920             :                 }
    4921             :             }
    4922             :         }
    4923             :     }
    4924           4 :     if (poFirstBand == nullptr)
    4925           0 :         return -1;
    4926             : 
    4927           4 :     return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
    4928             :                                          nYSize, nBufXSize, nBufYSize,
    4929           4 :                                          psExtraArg);
    4930             : }
    4931             : 
    4932             : /************************************************************************/
    4933             : /*                         BlockBasedRasterIO()                         */
    4934             : /*                                                                      */
    4935             : /*      This convenience function implements a dataset level            */
    4936             : /*      RasterIO() interface based on calling down to fetch blocks,     */
    4937             : /*      much like the GDALRasterBand::IRasterIO(), but it handles       */
    4938             : /*      all bands at once, so that a format driver that handles a       */
    4939             : /*      request for different bands of the same block efficiently       */
    4940             : /*      (i.e. without re-reading interleaved data) will benefit.        */
    4941             : /*                                                                      */
    4942             : /*      This method is intended to be called by an overridden           */
    4943             : /*      IRasterIO() method in the driver specific GDALDataset           */
    4944             : /*      derived class.                                                  */
    4945             : /*                                                                      */
    4946             : /*      Default internal implementation of RasterIO() ... utilizes      */
    4947             : /*      the Block access methods to satisfy the request.  This would    */
    4948             : /*      normally only be overridden by formats with overviews.          */
    4949             : /*                                                                      */
    4950             : /*      To keep things relatively simple, this method does not          */
    4951             : /*      currently take advantage of some special cases addressed in     */
    4952             : /*      GDALRasterBand::IRasterIO(), so it is likely best to only       */
    4953             : /*      call it when you know it will help.  That is in cases where     */
    4954             : /*      data is at 1:1 to the buffer, and you know the driver is        */
    4955             : /*      implementing interleaved IO efficiently on a block by block     */
    4956             : /*      basis. Overviews will be used when possible.                    */
    4957             : /************************************************************************/
    4958             : 
    4959       65948 : CPLErr GDALDataset::BlockBasedRasterIO(
    4960             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4961             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4962             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    4963             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    4964             : 
    4965             : {
    4966       65948 :     CPLAssert(nullptr != pData);
    4967             : 
    4968       65948 :     GByte **papabySrcBlock = nullptr;
    4969       65948 :     GDALRasterBlock *poBlock = nullptr;
    4970       65948 :     GDALRasterBlock **papoBlocks = nullptr;
    4971       65948 :     int nLBlockX = -1;
    4972       65948 :     int nLBlockY = -1;
    4973             :     int iBufYOff;
    4974             :     int iBufXOff;
    4975       65948 :     int nBlockXSize = 1;
    4976       65948 :     int nBlockYSize = 1;
    4977       65948 :     CPLErr eErr = CE_None;
    4978       65948 :     GDALDataType eDataType = GDT_UInt8;
    4979             : 
    4980       65948 :     const bool bUseIntegerRequestCoords =
    4981       65991 :         (!psExtraArg->bFloatingPointWindowValidity ||
    4982          43 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
    4983          41 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
    4984             : 
    4985             :     /* -------------------------------------------------------------------- */
    4986             :     /*      Ensure that all bands share a common block size and data type.  */
    4987             :     /* -------------------------------------------------------------------- */
    4988      312052 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4989             :     {
    4990      246104 :         GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4991             : 
    4992      246104 :         if (iBand == 0)
    4993             :         {
    4994       65948 :             poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4995       65948 :             eDataType = poBand->GetRasterDataType();
    4996             :         }
    4997             :         else
    4998             :         {
    4999      180156 :             int nThisBlockXSize = 0;
    5000      180156 :             int nThisBlockYSize = 0;
    5001      180156 :             poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
    5002      180156 :             if (nThisBlockXSize != nBlockXSize ||
    5003      180156 :                 nThisBlockYSize != nBlockYSize)
    5004             :             {
    5005           0 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    5006             :                                  "mismatched block sizes, use std method.");
    5007           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    5008             :                                          pData, nBufXSize, nBufYSize, eBufType,
    5009             :                                          nBandCount, panBandMap, nPixelSpace,
    5010           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    5011             :             }
    5012             : 
    5013      180156 :             if (eDataType != poBand->GetRasterDataType() &&
    5014           0 :                 (nXSize != nBufXSize || nYSize != nBufYSize))
    5015             :             {
    5016           0 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    5017             :                                  "mismatched band data types, use std method.");
    5018           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    5019             :                                          pData, nBufXSize, nBufYSize, eBufType,
    5020             :                                          nBandCount, panBandMap, nPixelSpace,
    5021           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    5022             :             }
    5023             :         }
    5024             :     }
    5025             : 
    5026             :     /* ==================================================================== */
    5027             :     /*      In this special case at full resolution we step through in      */
    5028             :     /*      blocks, turning the request over to the per-band                */
    5029             :     /*      IRasterIO(), but ensuring that all bands of one block are       */
    5030             :     /*      called before proceeding to the next.                           */
    5031             :     /* ==================================================================== */
    5032             : 
    5033       65948 :     if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
    5034             :     {
    5035             :         GDALRasterIOExtraArg sDummyExtraArg;
    5036       65944 :         INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
    5037             : 
    5038       65944 :         int nChunkYSize = 0;
    5039       65944 :         int nChunkXSize = 0;
    5040             : 
    5041      215391 :         for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
    5042             :         {
    5043      150463 :             const int nChunkYOff = iBufYOff + nYOff;
    5044      300926 :             nChunkYSize = std::min(nBlockYSize - (nChunkYOff % nBlockYSize),
    5045      150463 :                                    (nYOff + nYSize) - nChunkYOff);
    5046             : 
    5047      825901 :             for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
    5048             :             {
    5049      676453 :                 const int nChunkXOff = iBufXOff + nXOff;
    5050     1352910 :                 nChunkXSize = std::min(nBlockXSize - (nChunkXOff % nBlockXSize),
    5051      676453 :                                        (nXOff + nXSize) - nChunkXOff);
    5052             : 
    5053      676453 :                 GByte *pabyChunkData =
    5054      676453 :                     static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
    5055      676453 :                     static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
    5056             : 
    5057     3291120 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    5058             :                 {
    5059     2615680 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    5060             : 
    5061     5231370 :                     eErr = poBand->IRasterIO(
    5062             :                         eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
    5063             :                         nChunkYSize,
    5064     2615680 :                         pabyChunkData +
    5065     2615680 :                             static_cast<GPtrDiff_t>(iBand) * nBandSpace,
    5066             :                         nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
    5067     2615680 :                         nLineSpace, &sDummyExtraArg);
    5068     2615680 :                     if (eErr != CE_None)
    5069        1015 :                         return eErr;
    5070             :                 }
    5071             :             }
    5072             : 
    5073      168362 :             if (psExtraArg->pfnProgress != nullptr &&
    5074       18914 :                 !psExtraArg->pfnProgress(
    5075      168362 :                     1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
    5076             :                         nBufYSize,
    5077             :                     "", psExtraArg->pProgressData))
    5078             :             {
    5079           1 :                 return CE_Failure;
    5080             :             }
    5081             :         }
    5082             : 
    5083       64928 :         return CE_None;
    5084             :     }
    5085             : 
    5086             :     /* The code below cannot handle a write buffer smaller than the window. */
    5087             :     /* It would need completely separate code, as in GDALRasterBand::IRasterIO. */
    5088           4 :     if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
    5089             :     {
    5090           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    5091             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    5092             :                                  panBandMap, nPixelSpace, nLineSpace,
    5093           0 :                                  nBandSpace, psExtraArg);
    5094             :     }
    5095             : 
    5096             :     /* We could have a smarter implementation, but that will do for now */
    5097           4 :     if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    5098           0 :         (nBufXSize != nXSize || nBufYSize != nYSize))
    5099             :     {
    5100           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    5101             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    5102             :                                  panBandMap, nPixelSpace, nLineSpace,
    5103           0 :                                  nBandSpace, psExtraArg);
    5104             :     }
    5105             : 
    5106             :     /* ==================================================================== */
    5107             :     /*      Loop reading required source blocks to satisfy output           */
    5108             :     /*      request.  This is the most general implementation.              */
    5109             :     /* ==================================================================== */
    5110             : 
    5111           4 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
    5112             : 
    5113             :     papabySrcBlock =
    5114           4 :         static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
    5115             :     papoBlocks =
    5116           4 :         static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
    5117             : 
    5118             :     /* -------------------------------------------------------------------- */
    5119             :     /*      Select an overview level if appropriate.                        */
    5120             :     /* -------------------------------------------------------------------- */
    5121             : 
    5122             :     GDALRasterIOExtraArg sExtraArg;
    5123           4 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    5124           4 :     const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
    5125             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
    5126             :         panBandMap, &sExtraArg);
    5127           4 :     if (nOverviewLevel >= 0)
    5128             :     {
    5129           2 :         GetRasterBand(panBandMap[0])
    5130           2 :             ->GetOverview(nOverviewLevel)
    5131           2 :             ->GetBlockSize(&nBlockXSize, &nBlockYSize);
    5132             :     }
    5133             : 
    5134           4 :     double dfXOff = nXOff;
    5135           4 :     double dfYOff = nYOff;
    5136           4 :     double dfXSize = nXSize;
    5137           4 :     double dfYSize = nYSize;
    5138           4 :     if (sExtraArg.bFloatingPointWindowValidity)
    5139             :     {
    5140           2 :         dfXOff = sExtraArg.dfXOff;
    5141           2 :         dfYOff = sExtraArg.dfYOff;
    5142           2 :         dfXSize = sExtraArg.dfXSize;
    5143           2 :         dfYSize = sExtraArg.dfYSize;
    5144             :     }
    5145             : 
    5146             :     /* -------------------------------------------------------------------- */
    5147             :     /*      Compute stepping increment.                                     */
    5148             :     /* -------------------------------------------------------------------- */
    5149           4 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
    5150           4 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
    5151             : 
    5152           4 :     constexpr double EPS = 1e-10;
    5153             :     /* -------------------------------------------------------------------- */
    5154             :     /*      Loop over buffer computing source locations.                    */
    5155             :     /* -------------------------------------------------------------------- */
    5156          36 :     for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
    5157             :     {
    5158             :         GPtrDiff_t iSrcOffset;
    5159             : 
    5160             :         // Add small epsilon to avoid some numeric precision issues.
    5161          32 :         const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
    5162          32 :         const int iSrcY = static_cast<int>(std::min(
    5163          32 :             std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
    5164             : 
    5165          32 :         GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
    5166             :                                 static_cast<GPtrDiff_t>(nLineSpace);
    5167             : 
    5168         302 :         for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
    5169             :         {
    5170         270 :             const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
    5171         270 :             const int iSrcX = static_cast<int>(std::min(
    5172         270 :                 std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
    5173             : 
    5174             :             // FIXME: this code likely doesn't work if the dirty block gets
    5175             :             // flushed to disk before being completely written. In the meantime,
    5176             :             // bJustInitialize should probably be set to FALSE even if it is not
    5177             :             // ideal performance wise, and for lossy compression
    5178             : 
    5179             :             /* --------------------------------------------------------------------
    5180             :              */
    5181             :             /*      Ensure we have the appropriate block loaded. */
    5182             :             /* --------------------------------------------------------------------
    5183             :              */
    5184         270 :             if (iSrcX < nLBlockX * nBlockXSize ||
    5185         270 :                 iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
    5186         266 :                 iSrcY < nLBlockY * nBlockYSize ||
    5187         266 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
    5188             :             {
    5189           4 :                 nLBlockX = iSrcX / nBlockXSize;
    5190           4 :                 nLBlockY = iSrcY / nBlockYSize;
    5191             : 
    5192           4 :                 const bool bJustInitialize =
    5193           0 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
    5194           0 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
    5195           4 :                     nXOff <= nLBlockX * nBlockXSize &&
    5196           0 :                     nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
    5197             :                 /*bool bMemZeroBuffer = FALSE;
    5198             :                 if( eRWFlag == GF_Write && !bJustInitialize &&
    5199             :                     nXOff <= nLBlockX * nBlockXSize &&
    5200             :                     nYOff <= nLBlockY * nBlockYSize &&
    5201             :                     (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
    5202             :                      (nXOff + nXSize == GetRasterXSize() &&
    5203             :                      (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
    5204             :                     (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
    5205             :                      (nYOff + nYSize == GetRasterYSize() &&
    5206             :                      (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
    5207             :                 {
    5208             :                     bJustInitialize = TRUE;
    5209             :                     bMemZeroBuffer = TRUE;
    5210             :                 }*/
    5211          12 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    5212             :                 {
    5213           8 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    5214           8 :                     if (nOverviewLevel >= 0)
    5215           2 :                         poBand = poBand->GetOverview(nOverviewLevel);
    5216          16 :                     poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
    5217           8 :                                                         bJustInitialize);
    5218           8 :                     if (poBlock == nullptr)
    5219             :                     {
    5220           0 :                         eErr = CE_Failure;
    5221           0 :                         goto CleanupAndReturn;
    5222             :                     }
    5223             : 
    5224           8 :                     if (eRWFlag == GF_Write)
    5225           0 :                         poBlock->MarkDirty();
    5226             : 
    5227           8 :                     if (papoBlocks[iBand] != nullptr)
    5228           0 :                         papoBlocks[iBand]->DropLock();
    5229             : 
    5230           8 :                     papoBlocks[iBand] = poBlock;
    5231             : 
    5232           8 :                     papabySrcBlock[iBand] =
    5233           8 :                         static_cast<GByte *>(poBlock->GetDataRef());
    5234             :                     /*if( bMemZeroBuffer )
    5235             :                     {
    5236             :                         memset(papabySrcBlock[iBand], 0,
    5237             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
    5238             :                     * nBlockYSize);
    5239             :                     }*/
    5240             :                 }
    5241             :             }
    5242             : 
    5243             :             /* --------------------------------------------------------------------
    5244             :              */
    5245             :             /*      Copy over this pixel of data. */
    5246             :             /* --------------------------------------------------------------------
    5247             :              */
    5248         270 :             iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
    5249         270 :                           static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
    5250         270 :                           (static_cast<GPtrDiff_t>(iSrcY) -
    5251         270 :                            static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
    5252         270 :                               nBlockXSize) *
    5253         270 :                          nBandDataSize;
    5254             : 
    5255         980 :             for (int iBand = 0; iBand < nBandCount; iBand++)
    5256             :             {
    5257         710 :                 GByte *pabySrcBlock = papabySrcBlock[iBand];
    5258         710 :                 GPtrDiff_t iBandBufOffset =
    5259         710 :                     iBufOffset + static_cast<GPtrDiff_t>(iBand) *
    5260             :                                      static_cast<GPtrDiff_t>(nBandSpace);
    5261             : 
    5262         710 :                 if (eDataType == eBufType)
    5263             :                 {
    5264         710 :                     if (eRWFlag == GF_Read)
    5265         710 :                         memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
    5266         710 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
    5267             :                     else
    5268           0 :                         memcpy(pabySrcBlock + iSrcOffset,
    5269             :                                static_cast<const GByte *>(pData) +
    5270           0 :                                    iBandBufOffset,
    5271             :                                nBandDataSize);
    5272             :                 }
    5273             :                 else
    5274             :                 {
    5275             :                     /* type to type conversion ... ouch, this is expensive way
    5276             :                        of handling single words */
    5277             : 
    5278           0 :                     if (eRWFlag == GF_Read)
    5279           0 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
    5280             :                                         static_cast<GByte *>(pData) +
    5281           0 :                                             iBandBufOffset,
    5282             :                                         eBufType, 0, 1);
    5283             :                     else
    5284           0 :                         GDALCopyWords64(static_cast<const GByte *>(pData) +
    5285           0 :                                             iBandBufOffset,
    5286           0 :                                         eBufType, 0, pabySrcBlock + iSrcOffset,
    5287             :                                         eDataType, 0, 1);
    5288             :                 }
    5289             :             }
    5290             : 
    5291         270 :             iBufOffset += static_cast<int>(nPixelSpace);
    5292             :         }
    5293             :     }
    5294             : 
    5295             :     /* -------------------------------------------------------------------- */
    5296             :     /*      CleanupAndReturn.                                               */
    5297             :     /* -------------------------------------------------------------------- */
    5298           4 : CleanupAndReturn:
    5299           4 :     CPLFree(papabySrcBlock);
    5300           4 :     if (papoBlocks != nullptr)
    5301             :     {
    5302          12 :         for (int iBand = 0; iBand < nBandCount; iBand++)
    5303             :         {
    5304           8 :             if (papoBlocks[iBand] != nullptr)
    5305           8 :                 papoBlocks[iBand]->DropLock();
    5306             :         }
    5307           4 :         CPLFree(papoBlocks);
    5308             :     }
    5309             : 
    5310           4 :     return eErr;
    5311             : }
    5312             : 
    5313             : //! @endcond
    5314             : 
    5315             : /************************************************************************/
    5316             : /*                  GDALCopyWholeRasterGetSwathSize()                   */
    5317             : /************************************************************************/
    5318             : 
    5319        3406 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
    5320             :                                             GDALRasterBand *poDstPrototypeBand,
    5321             :                                             int nBandCount,
    5322             :                                             int bDstIsCompressed,
    5323             :                                             int bInterleave, int *pnSwathCols,
    5324             :                                             int *pnSwathLines)
    5325             : {
    5326        3406 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    5327        3406 :     int nSrcBlockXSize = 0;
    5328        3406 :     int nSrcBlockYSize = 0;
    5329        3406 :     int nBlockXSize = 0;
    5330        3406 :     int nBlockYSize = 0;
    5331             : 
    5332        3406 :     int nXSize = poSrcPrototypeBand->GetXSize();
    5333        3406 :     int nYSize = poSrcPrototypeBand->GetYSize();
    5334             : 
    5335        3406 :     poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
    5336        3406 :     poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    5337             : 
    5338        3406 :     const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
    5339        3406 :     const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
    5340             : 
    5341        3406 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5342        3406 :     if (bInterleave)
    5343         585 :         nPixelSize *= nBandCount;
    5344             : 
    5345             :     // aim for one row of blocks.  Do not settle for less.
    5346        3406 :     int nSwathCols = nXSize;
    5347        3406 :     int nSwathLines = nMaxBlockYSize;
    5348             : 
    5349        6812 :     const char *pszSrcCompression = poSrcPrototypeBand->GetMetadataItem(
    5350        3406 :         GDALMD_COMPRESSION, GDAL_MDD_IMAGE_STRUCTURE);
    5351        3406 :     if (pszSrcCompression == nullptr)
    5352             :     {
    5353        3386 :         auto poSrcDS = poSrcPrototypeBand->GetDataset();
    5354        3386 :         if (poSrcDS)
    5355        3380 :             pszSrcCompression = poSrcDS->GetMetadataItem(
    5356        3380 :                 GDALMD_COMPRESSION, GDAL_MDD_IMAGE_STRUCTURE);
    5357             :     }
    5358             : 
    5359             :     /* -------------------------------------------------------------------- */
    5360             :     /*      What will our swath size be?                                    */
    5361             :     /* -------------------------------------------------------------------- */
    5362             :     // When writing interleaved data in a compressed format, we want to be sure
    5363             :     // that each block will only be written once, so the swath size must not be
    5364             :     // greater than the block cache.
    5365        3406 :     const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
    5366             :     int nTargetSwathSize;
    5367        3406 :     if (pszSwathSize != nullptr)
    5368           0 :         nTargetSwathSize = static_cast<int>(
    5369           0 :             std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
    5370             :     else
    5371             :     {
    5372             :         // As a default, take 1/4 of the cache size.
    5373        3406 :         nTargetSwathSize = static_cast<int>(
    5374        3406 :             std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
    5375             : 
    5376             :         // but if the minimum ideal swath buf size is less, then go for it to
    5377             :         // avoid unnecessarily abusing RAM usage.
    5378             :         // but try to use 10 MB at least.
    5379        3406 :         GIntBig nIdealSwathBufSize =
    5380        3406 :             static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
    5381        3406 :         int nMinTargetSwathSize = 10 * 1000 * 1000;
    5382             : 
    5383        3406 :         if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
    5384        3406 :              GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
    5385             :         {
    5386           1 :             nMinTargetSwathSize = nTargetSwathSize;
    5387             :         }
    5388             : 
    5389        3406 :         if (nIdealSwathBufSize < nTargetSwathSize &&
    5390        3396 :             nIdealSwathBufSize < nMinTargetSwathSize)
    5391             :         {
    5392        3393 :             nIdealSwathBufSize = nMinTargetSwathSize;
    5393             :         }
    5394             : 
    5395        3406 :         if (pszSrcCompression != nullptr &&
    5396         185 :             EQUAL(pszSrcCompression, "JPEG2000") &&
    5397           0 :             (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
    5398           0 :                                    (nSrcBlockYSize % nBlockYSize) == 0)))
    5399             :         {
    5400           2 :             nIdealSwathBufSize =
    5401           4 :                 std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
    5402           2 :                                                  nSrcBlockYSize * nPixelSize);
    5403             :         }
    5404        3406 :         if (nTargetSwathSize > nIdealSwathBufSize)
    5405        3393 :             nTargetSwathSize = static_cast<int>(
    5406        3393 :                 std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
    5407             :     }
    5408             : 
    5409        3406 :     if (nTargetSwathSize < 1000000)
    5410           8 :         nTargetSwathSize = 1000000;
    5411             : 
    5412             :     /* But let's check that the swath does not exceed the block cache size. */
    5413        3627 :     if (bDstIsCompressed && bInterleave &&
    5414         221 :         nTargetSwathSize > GDALGetCacheMax64())
    5415             :     {
    5416           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    5417             :                  "When translating into a compressed interleave format, "
    5418             :                  "the block cache size (" CPL_FRMT_GIB ") "
    5419             :                  "should be at least the size of the swath (%d) "
    5420             :                  "(GDAL_SWATH_SIZE config. option)",
    5421             :                  GDALGetCacheMax64(), nTargetSwathSize);
    5422             :     }
    5423             : 
    5424             : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
    5425             : #define ROUND_TO(x, y) (((x) / (y)) * (y))
    5426             : 
    5427             :     // If both input and output datasets are tiled, and the tile dimensions
    5428             :     // are "compatible", try to stick to a swath dimension that is a multiple
    5429             :     // of input and output block dimensions.
    5430        3406 :     if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
    5431          47 :         IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
    5432          47 :         IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
    5433          47 :         IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    5434          47 :         IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    5435             :     {
    5436          47 :         if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
    5437          47 :                 nPixelSize <=
    5438          47 :             static_cast<GIntBig>(nTargetSwathSize))
    5439             :         {
    5440          47 :             nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
    5441          47 :             nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
    5442          47 :             if (nSwathCols == 0)
    5443           0 :                 nSwathCols = nMaxBlockXSize;
    5444          47 :             if (nSwathCols > nXSize)
    5445          45 :                 nSwathCols = nXSize;
    5446          47 :             nSwathLines = nMaxBlockYSize;
    5447             : 
    5448          47 :             if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
    5449          47 :                 static_cast<GIntBig>(nTargetSwathSize))
    5450             :             {
    5451           0 :                 nSwathCols = nXSize;
    5452           0 :                 nSwathLines = nBlockYSize;
    5453             :             }
    5454             :         }
    5455             :     }
    5456             : 
    5457        3406 :     const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
    5458        3406 :     const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
    5459        3406 :     if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
    5460             :     {
    5461           1 :         nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
    5462           1 :         if (nSwathLines == 0)
    5463           1 :             nSwathLines = 1;
    5464             : 
    5465           1 :         CPLDebug(
    5466             :             "GDAL",
    5467             :             "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
    5468             :             "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
    5469             :             "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
    5470           1 :             nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
    5471             :     }
    5472             :     // If we are processing single scans, try to handle several at once.
    5473             :     // If we are handling swaths already, only grow the swath if a row
    5474             :     // of blocks is substantially less than our target buffer size.
    5475        3405 :     else if (nSwathLines == 1 ||
    5476        2851 :              nMemoryPerCol * nSwathLines <
    5477        2851 :                  static_cast<GIntBig>(nTargetSwathSize) / 10)
    5478             :     {
    5479        3377 :         nSwathLines = std::min(
    5480             :             nYSize,
    5481        3377 :             std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
    5482             : 
    5483             :         /* If possible try to align to source and target block height */
    5484        3377 :         if ((nSwathLines % nMaxBlockYSize) != 0 &&
    5485         273 :             nSwathLines > nMaxBlockYSize &&
    5486         273 :             IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    5487         244 :             IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    5488         217 :             nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
    5489             :     }
    5490             : 
    5491        3406 :     if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
    5492           0 :         (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
    5493           0 :                                IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
    5494             :     {
    5495             :         // Typical use case: converting from Pleiades that is 2048x2048 tiled.
    5496           2 :         if (nSwathLines < nSrcBlockYSize)
    5497             :         {
    5498           0 :             nSwathLines = nSrcBlockYSize;
    5499             : 
    5500             :             // Number of pixels that can be read/write simultaneously.
    5501           0 :             nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
    5502           0 :             nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
    5503           0 :             if (nSwathCols == 0)
    5504           0 :                 nSwathCols = nSrcBlockXSize;
    5505           0 :             if (nSwathCols > nXSize)
    5506           0 :                 nSwathCols = nXSize;
    5507             : 
    5508           0 :             CPLDebug(
    5509             :                 "GDAL",
    5510             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    5511             :                 "too high block, "
    5512             :                 "use partial width at one time");
    5513             :         }
    5514           2 :         else if ((nSwathLines % nSrcBlockYSize) != 0)
    5515             :         {
    5516             :             /* Round on a multiple of nSrcBlockYSize */
    5517           0 :             nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
    5518           0 :             CPLDebug(
    5519             :                 "GDAL",
    5520             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    5521             :                 "round nSwathLines to block height : %d",
    5522             :                 nSwathLines);
    5523             :         }
    5524             :     }
    5525        3404 :     else if (bDstIsCompressed)
    5526             :     {
    5527         426 :         if (nSwathLines < nBlockYSize)
    5528             :         {
    5529         153 :             nSwathLines = nBlockYSize;
    5530             : 
    5531             :             // Number of pixels that can be read/write simultaneously.
    5532         153 :             nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
    5533         153 :             nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
    5534         153 :             if (nSwathCols == 0)
    5535           0 :                 nSwathCols = nBlockXSize;
    5536         153 :             if (nSwathCols > nXSize)
    5537         153 :                 nSwathCols = nXSize;
    5538             : 
    5539         153 :             CPLDebug(
    5540             :                 "GDAL",
    5541             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    5542             :                 "too high block, "
    5543             :                 "use partial width at one time");
    5544             :         }
    5545         273 :         else if ((nSwathLines % nBlockYSize) != 0)
    5546             :         {
    5547             :             // Round on a multiple of nBlockYSize.
    5548           9 :             nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
    5549           9 :             CPLDebug(
    5550             :                 "GDAL",
    5551             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    5552             :                 "round nSwathLines to block height : %d",
    5553             :                 nSwathLines);
    5554             :         }
    5555             :     }
    5556             : 
    5557        3406 :     *pnSwathCols = nSwathCols;
    5558        3406 :     *pnSwathLines = nSwathLines;
    5559        3406 : }
    5560             : 
    5561             : /************************************************************************/
    5562             : /*                     GDALDatasetCopyWholeRaster()                     */
    5563             : /************************************************************************/
    5564             : 
    5565             : /**
    5566             :  * \brief Copy all dataset raster data.
    5567             :  *
    5568             :  * This function copies the complete raster contents of one dataset to
    5569             :  * another similarly configured dataset.  The source and destination
    5570             :  * dataset must have the same number of bands, and the same width
    5571             :  * and height.  The bands do not have to have the same data type.
                     :  * The transfer buffer uses the data type of the first destination band.
    5572             :  *
    5573             :  * This function is primarily intended to support implementation of
    5574             :  * driver specific CreateCopy() functions.  It implements efficient copying,
    5575             :  * in particular "chunking" the copy in substantial blocks and, if appropriate,
    5576             :  * performing the transfer in a pixel interleaved fashion.
    5577             :  *
    5578             :  * Currently the only papszOptions values supported are:
    5579             :  * <ul>
    5580             :  * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
    5581             :  * write access pattern (this does not modify the layout of the destination
    5582             :  * data). "INTERLEAVE=ATTRIBUTES" is handled like PIXEL. Any other value only
                     :  * emits a warning. When the option does not decide, pixel interleaved access
                     :  * is used if the interleave metadata item of the source or of the
                     :  * destination dataset is PIXEL or LINE.</li>
    5583             :  * <li>"COMPRESSED=YES" to force alignment on target dataset block
    5584             :  * sizes to achieve best compression.</li>
    5585             :  * <li>"SKIP_HOLES=YES" to skip chunks
    5586             :  * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
    5587             :  * (GDAL &gt;= 2.2)</li>
    5588             :  * </ul>
    5589             :  * More options may be supported in the future.
    5590             :  *
    5591             :  * @param hSrcDS the source dataset
    5592             :  * @param hDstDS the destination dataset
    5593             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    5594             :  * @param pfnProgress progress reporting function (may be NULL).
    5595             :  * @param pProgressData callback data for progress function.
    5596             :  *
    5597             :  * @return CE_None on success (including when the destination has no band),
                     :  * or CE_Failure on failure (size or band count mismatch, interruption by
                     :  * the progress function, buffer allocation failure, read or write error).
    5598             :  */
    5599             : 
    5600        3378 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
    5601             :                                               GDALDatasetH hDstDS,
    5602             :                                               CSLConstList papszOptions,
    5603             :                                               GDALProgressFunc pfnProgress,
    5604             :                                               void *pProgressData)
    5605             : 
    5606             : {
    5607        3378 :     VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    5608        3378 :     VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    5609             : 
    5610        3378 :     GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
    5611        3378 :     GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
    5612             : 
    5613        3378 :     if (pfnProgress == nullptr)
    5614           0 :         pfnProgress = GDALDummyProgress;  // so pfnProgress can be called unconditionally below
    5615             : 
    5616             :     /* -------------------------------------------------------------------- */
    5617             :     /*      Confirm the datasets match in size and band counts.             */
    5618             :     /* -------------------------------------------------------------------- */
    5619        3378 :     const int nXSize = poDstDS->GetRasterXSize();
    5620        3378 :     const int nYSize = poDstDS->GetRasterYSize();
    5621        3378 :     const int nBandCount = poDstDS->GetRasterCount();
    5622             : 
    5623        3378 :     if (poSrcDS->GetRasterXSize() != nXSize ||
    5624        6756 :         poSrcDS->GetRasterYSize() != nYSize ||
    5625        3378 :         poSrcDS->GetRasterCount() != nBandCount)
    5626             :     {
    5627           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    5628             :                  "Input and output dataset sizes or band counts do not\n"
    5629             :                  "match in GDALDatasetCopyWholeRaster()");
    5630           0 :         return CE_Failure;
    5631             :     }
    5632             : 
    5633             :     /* -------------------------------------------------------------------- */
    5634             :     /*      Report preliminary (0) progress.                                */
    5635             :     /* -------------------------------------------------------------------- */
    5636        3378 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    5637             :     {
    5638           1 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    5639             :                  "User terminated CreateCopy()");
    5640           1 :         return CE_Failure;
    5641             :     }
    5642             : 
    5643             :     /* -------------------------------------------------------------------- */
    5644             :     /*      Get our prototype band, and assume the others are similarly     */
    5645             :     /*      configured.                                                     */
    5646             :     /* -------------------------------------------------------------------- */
    5647        3377 :     if (nBandCount == 0)
    5648           0 :         return CE_None;  // no band: nothing to copy
    5649             : 
    5650        3377 :     GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
    5651        3377 :     GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
    5652        3377 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();  // buffer type for every read and write below
    5653             : 
    5654             :     /* -------------------------------------------------------------------- */
    5655             :     /*      Do we want to try and do the operation in a pixel               */
    5656             :     /*      interleaved fashion?                                            */
    5657             :     /* -------------------------------------------------------------------- */
    5658        3377 :     bool bInterleave = false;
    5659             :     const char *pszInterleave =
    5660        3377 :         poSrcDS->GetMetadataItem(GDALMD_INTERLEAVE, GDAL_MDD_IMAGE_STRUCTURE);
    5661        3377 :     if (pszInterleave != nullptr &&
    5662        2966 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    5663         209 :         bInterleave = true;
    5664             : 
    5665             :     pszInterleave =
    5666        3377 :         poDstDS->GetMetadataItem(GDALMD_INTERLEAVE, GDAL_MDD_IMAGE_STRUCTURE);
    5667        3377 :     if (pszInterleave != nullptr &&
    5668        2911 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    5669         530 :         bInterleave = true;
    5670             : 
    5671        3377 :     pszInterleave = CSLFetchNameValue(papszOptions, GDALMD_INTERLEAVE);  // explicit option, checked last so it overrides the metadata hints
    5672        3377 :     if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
    5673           5 :         bInterleave = true;
    5674        3372 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
    5675          13 :         bInterleave = false;
    5676             :     // attributes is specific to the TileDB driver
    5677        3359 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
    5678           4 :         bInterleave = true;
    5679        3355 :     else if (pszInterleave != nullptr)  // any other value: warn and keep bInterleave as is
    5680             :     {
    5681           0 :         CPLError(CE_Warning, CPLE_NotSupported,
    5682             :                  "Unsupported value for option INTERLEAVE");
    5683             :     }
    5684             : 
    5685             :     // If the destination is compressed, we must try to write blocks just once,
    5686             :     // to save disk space (GTiff case for example), and to avoid data loss
    5687             :     // (JPEG compression for example).
    5688        3377 :     bool bDstIsCompressed = false;
    5689             :     const char *pszDstCompressed =
    5690        3377 :         CSLFetchNameValue(papszOptions, "COMPRESSED");
    5691        3377 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5692         400 :         bDstIsCompressed = true;
    5693             : 
    5694             :     /* -------------------------------------------------------------------- */
    5695             :     /*      What will our swath size be?                                    */
    5696             :     /* -------------------------------------------------------------------- */
    5697             : 
    5698        3377 :     int nSwathCols = 0;
    5699        3377 :     int nSwathLines = 0;
    5700        3377 :     GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
    5701             :                                     nBandCount, bDstIsCompressed, bInterleave,
    5702             :                                     &nSwathCols, &nSwathLines);
    5703             : 
    5704        3377 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5705        3377 :     if (bInterleave)
    5706         585 :         nPixelSize *= nBandCount;  // the buffer must hold all bands of a swath at once
    5707             : 
    5708        3377 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    5709        3377 :     if (pSwathBuf == nullptr)
    5710             :     {
    5711           0 :         return CE_Failure;
    5712             :     }
    5713             : 
    5714        3377 :     CPLDebug("GDAL",
    5715             :              "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
    5716             :              nSwathCols, nSwathLines, static_cast<int>(bInterleave));
    5717             : 
    5718             :     // Advise the source raster that we are going to read it completely
    5719             :     // Note: this might already have been done by GDALCreateCopy() in the
    5720             :     // likely case this function is indirectly called by it
    5721        3377 :     poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
    5722        3377 :                         nullptr, nullptr);
    5723             : 
    5724             :     /* ==================================================================== */
    5725             :     /*      Band oriented (uninterleaved) case.                             */
    5726             :     /* ==================================================================== */
    5727        3377 :     CPLErr eErr = CE_None;
    5728             :     const bool bCheckHoles =
    5729        3377 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    5730             : 
    5731        3377 :     if (!bInterleave)
    5732             :     {
    5733             :         GDALRasterIOExtraArg sExtraArg;
    5734        2792 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5735        2792 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    5736             : 
    5737        8376 :         const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
    5738        2792 :                                      DIV_ROUND_UP(nYSize, nSwathLines) *
    5739        2792 :                                      DIV_ROUND_UP(nXSize, nSwathCols);  // swath-sized chunks over all bands
    5740        2792 :         GIntBig nBlocksDone = 0;
    5741             : 
    5742        8027 :         for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
    5743             :         {
    5744        5235 :             int nBand = iBand + 1;
    5745             : 
    5746       10733 :             for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5747             :             {
    5748        5498 :                 int nThisLines = nSwathLines;
    5749             : 
    5750        5498 :                 if (iY + nThisLines > nYSize)
    5751         375 :                     nThisLines = nYSize - iY;  // last swath may be shorter
    5752             : 
    5753       10996 :                 for (int iX = 0; iX < nXSize && eErr == CE_None;
    5754        5498 :                      iX += nSwathCols)
    5755             :                 {
    5756        5498 :                     int nThisCols = nSwathCols;
    5757             : 
    5758        5498 :                     if (iX + nThisCols > nXSize)
    5759           0 :                         nThisCols = nXSize - iX;
    5760             : 
    5761        5498 :                     int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5762        5498 :                     if (bCheckHoles)  // SKIP_HOLES: ask the source band whether this window has data
    5763             :                     {
    5764             :                         nStatus = poSrcDS->GetRasterBand(nBand)
    5765        3780 :                                       ->GetDataCoverageStatus(
    5766             :                                           iX, iY, nThisCols, nThisLines,
    5767             :                                           GDAL_DATA_COVERAGE_STATUS_DATA);
    5768             :                     }
    5769        5498 :                     if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5770             :                     {
    5771        5494 :                         sExtraArg.pfnProgress = GDALScaledProgress;  // the read reports over the first half of this chunk's progress span
    5772       10988 :                         sExtraArg.pProgressData = GDALCreateScaledProgress(
    5773        5494 :                             nBlocksDone / static_cast<double>(nTotalBlocks),
    5774        5494 :                             (nBlocksDone + 0.5) /
    5775        5494 :                                 static_cast<double>(nTotalBlocks),
    5776             :                             pfnProgress, pProgressData);
    5777        5494 :                         if (sExtraArg.pProgressData == nullptr)
    5778        1688 :                             sExtraArg.pfnProgress = nullptr;  // no scaled progress data: disable the callback
    5779             : 
    5780        5494 :                         eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    5781             :                                                  nThisLines, pSwathBuf,
    5782             :                                                  nThisCols, nThisLines, eDT, 1,
    5783             :                                                  &nBand, 0, 0, 0, &sExtraArg);
    5784             : 
    5785        5494 :                         GDALDestroyScaledProgress(sExtraArg.pProgressData);
    5786             : 
    5787        5494 :                         if (eErr == CE_None)
    5788        5486 :                             eErr = poDstDS->RasterIO(
    5789             :                                 GF_Write, iX, iY, nThisCols, nThisLines,
    5790             :                                 pSwathBuf, nThisCols, nThisLines, eDT, 1,
    5791             :                                 &nBand, 0, 0, 0, nullptr);  // the write is given no progress argument
    5792             :                     }
    5793             : 
    5794        5498 :                     nBlocksDone++;  // counted even when the chunk was skipped as a hole
    5795       10953 :                     if (eErr == CE_None &&
    5796        5455 :                         !pfnProgress(nBlocksDone /
    5797        5455 :                                          static_cast<double>(nTotalBlocks),
    5798             :                                      nullptr, pProgressData))
    5799             :                     {
    5800           2 :                         eErr = CE_Failure;
    5801           2 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    5802             :                                  "User terminated CreateCopy()");
    5803             :                     }
    5804             :                 }
    5805             :             }
    5806             :         }
    5807             :     }
    5808             : 
    5809             :     /* ==================================================================== */
    5810             :     /*      Pixel interleaved case.                                         */
    5811             :     /* ==================================================================== */
    5812             :     else /* if( bInterleave ) */
    5813             :     {
    5814             :         GDALRasterIOExtraArg sExtraArg;
    5815         585 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5816         585 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    5817             : 
    5818         585 :         const GIntBig nTotalBlocks =
    5819         585 :             static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
    5820         585 :             DIV_ROUND_UP(nXSize, nSwathCols);  // all bands of a chunk are transferred together
    5821         585 :         GIntBig nBlocksDone = 0;
    5822             : 
    5823        1392 :         for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5824             :         {
    5825         807 :             int nThisLines = nSwathLines;
    5826             : 
    5827         807 :             if (iY + nThisLines > nYSize)
    5828         198 :                 nThisLines = nYSize - iY;
    5829             : 
    5830        1619 :             for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    5831             :             {
    5832         812 :                 int nThisCols = nSwathCols;
    5833             : 
    5834         812 :                 if (iX + nThisCols > nXSize)
    5835           3 :                     nThisCols = nXSize - iX;
    5836             : 
    5837         812 :                 int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5838         812 :                 if (bCheckHoles)
    5839             :                 {
    5840         553 :                     nStatus = 0;
    5841         606 :                     for (int iBand = 0; iBand < nBandCount; iBand++)
    5842             :                     {
    5843         587 :                         nStatus |= poSrcDS->GetRasterBand(iBand + 1)
    5844         587 :                                        ->GetDataCoverageStatus(
    5845             :                                            iX, iY, nThisCols, nThisLines,
    5846             :                                            GDAL_DATA_COVERAGE_STATUS_DATA);
    5847         587 :                         if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5848         534 :                             break;  // one band with data is enough to copy the chunk
    5849             :                     }
    5850             :                 }
    5851         812 :                 if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5852             :                 {
    5853         793 :                     sExtraArg.pfnProgress = GDALScaledProgress;  // the read reports over the first half of this chunk's progress span
    5854        1586 :                     sExtraArg.pProgressData = GDALCreateScaledProgress(
    5855         793 :                         nBlocksDone / static_cast<double>(nTotalBlocks),
    5856         793 :                         (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
    5857             :                         pfnProgress, pProgressData);
    5858         793 :                     if (sExtraArg.pProgressData == nullptr)
    5859         377 :                         sExtraArg.pfnProgress = nullptr;  // no scaled progress data: disable the callback
    5860             : 
    5861         793 :                     eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    5862             :                                              nThisLines, pSwathBuf, nThisCols,
    5863             :                                              nThisLines, eDT, nBandCount,
    5864             :                                              nullptr, 0, 0, 0, &sExtraArg);
    5865             : 
    5866         793 :                     GDALDestroyScaledProgress(sExtraArg.pProgressData);
    5867             : 
    5868         793 :                     if (eErr == CE_None)
    5869         792 :                         eErr = poDstDS->RasterIO(
    5870             :                             GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
    5871             :                             nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
    5872             :                             0, 0, nullptr);  // the write is given no progress argument
    5873             :                 }
    5874             : 
    5875         812 :                 nBlocksDone++;  // counted even when the chunk was skipped as a hole
    5876        1619 :                 if (eErr == CE_None &&
    5877         807 :                     !pfnProgress(nBlocksDone /
    5878         807 :                                      static_cast<double>(nTotalBlocks),
    5879             :                                  nullptr, pProgressData))
    5880             :                 {
    5881           1 :                     eErr = CE_Failure;
    5882           1 :                     CPLError(CE_Failure, CPLE_UserInterrupt,
    5883             :                              "User terminated CreateCopy()");
    5884             :                 }
    5885             :             }
    5886             :         }
    5887             :     }
    5888             : 
    5889             :     /* -------------------------------------------------------------------- */
    5890             :     /*      Cleanup                                                         */
    5891             :     /* -------------------------------------------------------------------- */
    5892        3377 :     CPLFree(pSwathBuf);
    5893             : 
    5894        3377 :     return eErr;
    5895             : }
    5896             : 
    5897             : /************************************************************************/
    5898             : /*                   GDALRasterBandCopyWholeRaster()                    */
    5899             : /************************************************************************/
    5900             : 
    5901             : /**
    5902             :  * \brief Copy a whole raster band
    5903             :  *
    5904             :  * This function copies the complete raster contents of one band to
    5905             :  * another similarly configured band.  The source and destination
    5906             :  * bands must have the same width and height.  The bands do not have
    5907             :  * to have the same data type.
    5908             :  *
    5909             :  * It implements efficient copying, in particular "chunking" the copy in
    5910             :  * substantial blocks.
    5911             :  *
    5912             :  * Currently the only papszOptions values supported are:
    5913             :  * <ul>
    5914             :  * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
    5915             :  * achieve best compression.</li>
    5916             :  * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
    5917             :  * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL &gt;= 2.2)</li>
    5918             :  * </ul>
    5919             :  *
    5920             :  * @param hSrcBand the source band
    5921             :  * @param hDstBand the destination band
    5922             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    5923             :  * @param pfnProgress progress reporting function.
    5924             :  * @param pProgressData callback data for progress function.
    5925             :  *
    5926             :  * @return CE_None on success, or CE_Failure on failure.
    5927             :  */
    5928             : 
    5929          29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
    5930             :     GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
    5931             :     const char *const *const papszOptions, GDALProgressFunc pfnProgress,
    5932             :     void *pProgressData)
    5933             : 
    5934             : {
    5935          29 :     VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    5936          29 :     VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    5937             : 
    5938          29 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    5939          29 :     GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
    5940          29 :     CPLErr eErr = CE_None;
    5941             : 
    5942          29 :     if (pfnProgress == nullptr)
    5943           2 :         pfnProgress = GDALDummyProgress;
    5944             : 
    5945             :     /* -------------------------------------------------------------------- */
    5946             :     /*      Confirm the datasets match in size and band counts.             */
    5947             :     /* -------------------------------------------------------------------- */
    5948          29 :     int nXSize = poSrcBand->GetXSize();
    5949          29 :     int nYSize = poSrcBand->GetYSize();
    5950             : 
    5951          29 :     if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
    5952             :     {
    5953           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    5954             :                  "Input and output band sizes do not\n"
    5955             :                  "match in GDALRasterBandCopyWholeRaster()");
    5956           0 :         return CE_Failure;
    5957             :     }
    5958             : 
    5959             :     /* -------------------------------------------------------------------- */
    5960             :     /*      Report preliminary (0) progress.                                */
    5961             :     /* -------------------------------------------------------------------- */
    5962          29 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    5963             :     {
    5964           0 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    5965             :                  "User terminated CreateCopy()");
    5966           0 :         return CE_Failure;
    5967             :     }
    5968             : 
    5969          29 :     GDALDataType eDT = poDstBand->GetRasterDataType();
    5970             : 
    5971             :     // If the destination is compressed, we must try to write blocks just once,
    5972             :     // to save disk space (GTiff case for example), and to avoid data loss
    5973             :     // (JPEG compression for example).
    5974          29 :     bool bDstIsCompressed = false;
    5975             :     const char *pszDstCompressed =
    5976          29 :         CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
    5977          29 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5978          26 :         bDstIsCompressed = true;
    5979             : 
    5980             :     /* -------------------------------------------------------------------- */
    5981             :     /*      What will our swath size be?                                    */
    5982             :     /* -------------------------------------------------------------------- */
    5983             : 
    5984          29 :     int nSwathCols = 0;
    5985          29 :     int nSwathLines = 0;
    5986          29 :     GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
    5987             :                                     FALSE, &nSwathCols, &nSwathLines);
    5988             : 
    5989          29 :     const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5990             : 
    5991          29 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    5992          29 :     if (pSwathBuf == nullptr)
    5993             :     {
    5994           0 :         return CE_Failure;
    5995             :     }
    5996             : 
    5997          29 :     CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
    5998             :              nSwathCols, nSwathLines);
    5999             : 
    6000             :     const bool bCheckHoles =
    6001          29 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    6002             : 
    6003             :     // Advise the source raster that we are going to read it completely
    6004          29 :     poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
    6005             : 
    6006             :     /* ==================================================================== */
    6007             :     /*      Band oriented (uninterleaved) case.                             */
    6008             :     /* ==================================================================== */
    6009             : 
    6010          72 :     for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    6011             :     {
    6012          43 :         int nThisLines = nSwathLines;
    6013             : 
    6014          43 :         if (iY + nThisLines > nYSize)
    6015           8 :             nThisLines = nYSize - iY;
    6016             : 
    6017          86 :         for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    6018             :         {
    6019          43 :             int nThisCols = nSwathCols;
    6020             : 
    6021          43 :             if (iX + nThisCols > nXSize)
    6022           0 :                 nThisCols = nXSize - iX;
    6023             : 
    6024          43 :             int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    6025          43 :             if (bCheckHoles)
    6026             :             {
    6027           0 :                 nStatus = poSrcBand->GetDataCoverageStatus(
    6028             :                     iX, iY, nThisCols, nThisLines,
    6029             :                     GDAL_DATA_COVERAGE_STATUS_DATA);
    6030             :             }
    6031          43 :             if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    6032             :             {
    6033          43 :                 eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
    6034             :                                            nThisLines, pSwathBuf, nThisCols,
    6035             :                                            nThisLines, eDT, 0, 0, nullptr);
    6036             : 
    6037          43 :                 if (eErr == CE_None)
    6038          43 :                     eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
    6039             :                                                nThisLines, pSwathBuf, nThisCols,
    6040             :                                                nThisLines, eDT, 0, 0, nullptr);
    6041             :             }
    6042             : 
    6043          86 :             if (eErr == CE_None && !pfnProgress(double(iY + nThisLines) /
    6044          43 :                                                     static_cast<double>(nYSize),
    6045             :                                                 nullptr, pProgressData))
    6046             :             {
    6047           0 :                 eErr = CE_Failure;
    6048           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt,
    6049             :                          "User terminated CreateCopy()");
    6050             :             }
    6051             :         }
    6052             :     }
    6053             : 
    6054             :     /* -------------------------------------------------------------------- */
    6055             :     /*      Cleanup                                                         */
    6056             :     /* -------------------------------------------------------------------- */
    6057          29 :     CPLFree(pSwathBuf);
    6058             : 
    6059          29 :     return eErr;
    6060             : }
    6061             : 
    6062             : /************************************************************************/
    6063             : /*                     GDALCopyRasterIOExtraArg ()                      */
    6064             : /************************************************************************/
    6065             : 
    6066      534956 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
    6067             :                               const GDALRasterIOExtraArg *psSrcArg)
    6068             : {
    6069      534956 :     INIT_RASTERIO_EXTRA_ARG(*psDestArg);
    6070      534956 :     if (psSrcArg)
    6071             :     {
    6072      534956 :         psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
    6073      534956 :         psDestArg->pfnProgress = psSrcArg->pfnProgress;
    6074      534956 :         psDestArg->pProgressData = psSrcArg->pProgressData;
    6075      534956 :         psDestArg->bFloatingPointWindowValidity =
    6076      534956 :             psSrcArg->bFloatingPointWindowValidity;
    6077      534956 :         if (psSrcArg->bFloatingPointWindowValidity)
    6078             :         {
    6079      211967 :             psDestArg->dfXOff = psSrcArg->dfXOff;
    6080      211967 :             psDestArg->dfYOff = psSrcArg->dfYOff;
    6081      211967 :             psDestArg->dfXSize = psSrcArg->dfXSize;
    6082      211967 :             psDestArg->dfYSize = psSrcArg->dfYSize;
    6083             :         }
    6084      534956 :         if (psSrcArg->nVersion >= 2)
    6085             :         {
    6086      534956 :             psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
    6087             :         }
    6088      534956 :         if (psSrcArg->nVersion >= 3)
    6089             :         {
    6090      534956 :             psDestArg->bOperateInBufType = psSrcArg->bOperateInBufType;
    6091             :         }
    6092             :     }
    6093      534956 : }
    6094             : 
    6095             : /************************************************************************/
    6096             : /*                           HasOnlyNoData()                            */
    6097             : /************************************************************************/
    6098             : 
    6099    51285976 : template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
    6100             : {
    6101    51285976 :     return value == noDataValue;
    6102             : }
    6103             : 
    6104        5509 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
    6105             : {
    6106             :     using std::isnan;
    6107        5509 :     return isnan(noDataValue) ? isnan(value) : value == noDataValue;
    6108             : }
    6109             : 
    6110      251221 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
    6111             : {
    6112      251221 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    6113             : }
    6114             : 
    6115      264259 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
    6116             : {
    6117      264259 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    6118             : }
    6119             : 
    6120             : template <class T>
    6121       12026 : static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
    6122             :                            size_t nHeight, size_t nLineStride,
    6123             :                            size_t nComponents)
    6124             : {
    6125             :     // Fast test: check the 4 corners and the middle pixel.
    6126       23299 :     for (size_t iBand = 0; iBand < nComponents; iBand++)
    6127             :     {
    6128       24097 :         if (!(IsEqualToNoData(pBuffer[iBand], noDataValue) &&
    6129       11880 :               IsEqualToNoData(pBuffer[(nWidth - 1) * nComponents + iBand],
    6130       11750 :                               noDataValue) &&
    6131       11750 :               IsEqualToNoData(
    6132       11750 :                   pBuffer[((nHeight - 1) / 2 * nLineStride + (nWidth - 1) / 2) *
    6133       11750 :                               nComponents +
    6134             :                           iBand],
    6135       11276 :                   noDataValue) &&
    6136       11276 :               IsEqualToNoData(
    6137       11276 :                   pBuffer[(nHeight - 1) * nLineStride * nComponents + iBand],
    6138             :                   noDataValue) &&
    6139       11276 :               IsEqualToNoData(
    6140       11276 :                   pBuffer[((nHeight - 1) * nLineStride + nWidth - 1) *
    6141       11276 :                               nComponents +
    6142             :                           iBand],
    6143             :                   noDataValue)))
    6144             :         {
    6145         944 :             return false;
    6146             :         }
    6147             :     }
    6148             : 
    6149             :     // Test all pixels.
    6150       52954 :     for (size_t iY = 0; iY < nHeight; iY++)
    6151             :     {
    6152       41993 :         const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
    6153    51790448 :         for (size_t iX = 0; iX < nWidth * nComponents; iX++)
    6154             :         {
    6155    51748615 :             if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
    6156             :             {
    6157         121 :                 return false;
    6158             :             }
    6159             :         }
    6160             :     }
    6161       10961 :     return true;
    6162             : }
    6163             : 
    6164             : /************************************************************************/
    6165             : /*                      GDALBufferHasOnlyNoData()                       */
    6166             : /************************************************************************/
    6167             : 
    6168       44014 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
    6169             :                              size_t nWidth, size_t nHeight, size_t nLineStride,
    6170             :                              size_t nComponents, int nBitsPerSample,
    6171             :                              GDALBufferSampleFormat nSampleFormat)
    6172             : {
    6173             :     // In the case where the nodata is 0, we can compare several bytes at
    6174             :     // once. Select the largest natural integer type for the architecture.
    6175       44014 :     if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    6176             :         // Do not use this optimized code path for floating point numbers,
    6177             :         // as it can't detect negative zero.
    6178             :         nSampleFormat != GSF_FLOATING_POINT)
    6179             :     {
    6180       27266 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    6181       27266 :         const size_t nSize =
    6182       27266 :             static_cast<size_t>((static_cast<uint64_t>(nWidth) * nHeight *
    6183       27266 :                                      nComponents * nBitsPerSample +
    6184             :                                  7) /
    6185             :                                 8);
    6186             : #ifdef HAVE_SSE2
    6187       27266 :         size_t n = nSize;
    6188             :         // Align to 16 bytes
    6189       27329 :         while ((reinterpret_cast<uintptr_t>(pabyBuffer) & 15) != 0 && n > 0)
    6190             :         {
    6191          73 :             --n;
    6192          73 :             if (*pabyBuffer)
    6193          10 :                 return false;
    6194          63 :             pabyBuffer++;
    6195             :         }
    6196             : 
    6197       27256 :         const auto zero = _mm_setzero_si128();
    6198       27256 :         constexpr int UNROLLING = 4;
    6199     2223230 :         while (n >= UNROLLING * sizeof(zero))
    6200             :         {
    6201     2207980 :             const auto v0 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6202             :                 pabyBuffer + 0 * sizeof(zero)));
    6203     2207980 :             const auto v1 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6204     2207980 :                 pabyBuffer + 1 * sizeof(zero)));
    6205     2207980 :             const auto v2 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6206     2207980 :                 pabyBuffer + 2 * sizeof(zero)));
    6207     2207980 :             const auto v3 = _mm_load_si128(reinterpret_cast<const __m128i *>(
    6208     2207980 :                 pabyBuffer + 3 * sizeof(zero)));
    6209             :             const auto v =
    6210     6623940 :                 _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    6211             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    6212             :             if (!_mm_test_all_zeros(v, v))
    6213             : #else
    6214     4415960 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    6215             : #endif
    6216             :             {
    6217       12002 :                 return false;
    6218             :             }
    6219     2195980 :             pabyBuffer += UNROLLING * sizeof(zero);
    6220     2195980 :             n -= UNROLLING * sizeof(zero);
    6221             :         }
    6222             : 
    6223      233639 :         while (n > 0)
    6224             :         {
    6225      218489 :             --n;
    6226      218489 :             if (*pabyBuffer)
    6227         104 :                 return false;
    6228      218385 :             pabyBuffer++;
    6229             :         }
    6230             : #else
    6231             : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
    6232             :         // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
    6233             :         typedef std::uint64_t WordType;
    6234             : #else
    6235             :         typedef std::uint32_t WordType;
    6236             : #endif
    6237             : 
    6238             :         const size_t nInitialIters =
    6239             :             std::min(sizeof(WordType) -
    6240             :                          static_cast<size_t>(
    6241             :                              reinterpret_cast<std::uintptr_t>(pabyBuffer) %
    6242             :                              sizeof(WordType)),
    6243             :                      nSize);
    6244             :         size_t i = 0;
    6245             :         for (; i < nInitialIters; i++)
    6246             :         {
    6247             :             if (pabyBuffer[i])
    6248             :                 return false;
    6249             :         }
    6250             :         for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
    6251             :         {
    6252             :             if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
    6253             :                 return false;
    6254             :         }
    6255             :         for (; i < nSize; i++)
    6256             :         {
    6257             :             if (pabyBuffer[i])
    6258             :                 return false;
    6259             :         }
    6260             : #endif
    6261       15150 :         return true;
    6262             :     }
    6263             : 
    6264             : #ifdef HAVE_SSE2
    6265       16748 :     else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    6266         710 :              nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    6267             :     {
    6268         710 :         const auto signMask = _mm_set1_epi32(0x7FFFFFFF);
    6269         710 :         const auto zero = _mm_setzero_si128();
    6270         710 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    6271         710 :         const size_t n = nWidth * nHeight * nComponents;
    6272             : 
    6273         710 :         size_t i = 0;
    6274         710 :         constexpr int UNROLLING = 4;
    6275         710 :         constexpr size_t VALUES_PER_ITER =
    6276             :             UNROLLING * sizeof(zero) / sizeof(float);
    6277       24985 :         for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
    6278             :         {
    6279       24936 :             const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6280             :                 pabyBuffer + 0 * sizeof(zero)));
    6281       24936 :             const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6282       24936 :                 pabyBuffer + 1 * sizeof(zero)));
    6283       24936 :             const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6284       24936 :                 pabyBuffer + 2 * sizeof(zero)));
    6285       24936 :             const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6286       24936 :                 pabyBuffer + 3 * sizeof(zero)));
    6287       74808 :             auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    6288             :             // Clear the sign bit (makes -0.0 become +0.0)
    6289       24936 :             v = _mm_and_si128(v, signMask);
    6290             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    6291             :             if (!_mm_test_all_zeros(v, v))
    6292             : #else
    6293       49872 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    6294             : #endif
    6295             :             {
    6296         661 :                 return false;
    6297             :             }
    6298       24275 :             pabyBuffer += UNROLLING * sizeof(zero);
    6299             :         }
    6300             : 
    6301         304 :         for (; i < n; i++)
    6302             :         {
    6303             :             uint32_t bits;
    6304         272 :             memcpy(&bits, pabyBuffer, sizeof(bits));
    6305         272 :             pabyBuffer += sizeof(bits);
    6306         272 :             if ((bits & 0x7FFFFFFF) != 0)
    6307          17 :                 return false;
    6308             :         }
    6309             : 
    6310          32 :         return true;
    6311             :     }
    6312             : 
    6313       16038 :     else if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    6314        4005 :              nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    6315             :     {
    6316        4005 :         const auto signMask = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
    6317        4005 :         const auto zero = _mm_setzero_si128();
    6318        4005 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    6319        4005 :         const size_t n = nWidth * nHeight * nComponents;
    6320             : 
    6321        4005 :         size_t i = 0;
    6322        4005 :         constexpr int UNROLLING = 4;
    6323        4005 :         constexpr size_t VALUES_PER_ITER =
    6324             :             UNROLLING * sizeof(zero) / sizeof(double);
    6325     1664960 :         for (; i + VALUES_PER_ITER <= n; i += VALUES_PER_ITER)
    6326             :         {
    6327     1661340 :             const auto v0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6328             :                 pabyBuffer + 0 * sizeof(zero)));
    6329     1661340 :             const auto v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6330     1661340 :                 pabyBuffer + 1 * sizeof(zero)));
    6331     1661340 :             const auto v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6332     1661340 :                 pabyBuffer + 2 * sizeof(zero)));
    6333     1661340 :             const auto v3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
    6334     1661340 :                 pabyBuffer + 3 * sizeof(zero)));
    6335     4984020 :             auto v = _mm_or_si128(_mm_or_si128(v0, v1), _mm_or_si128(v2, v3));
    6336             :             // Clear the sign bit (makes -0.0 become +0.0)
    6337     1661340 :             v = _mm_and_si128(v, signMask);
    6338             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    6339             :             if (!_mm_test_all_zeros(v, v))
    6340             : #else
    6341     3322680 :             if (_mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) != 0xFFFF)
    6342             : #endif
    6343             :             {
    6344         389 :                 return false;
    6345             :             }
    6346     1660950 :             pabyBuffer += UNROLLING * sizeof(zero);
    6347             :         }
    6348             : 
    6349        3643 :         for (; i < n; i++)
    6350             :         {
    6351             :             uint64_t bits;
    6352          34 :             memcpy(&bits, pabyBuffer, sizeof(bits));
    6353          34 :             pabyBuffer += sizeof(bits);
    6354          34 :             if ((bits & 0x7FFFFFFFFFFFFFFFULL) != 0)
    6355           7 :                 return false;
    6356             :         }
    6357             : 
    6358        3609 :         return true;
    6359             :     }
    6360             : #endif
    6361             : 
    6362       12033 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
    6363             :     {
    6364       22424 :         return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
    6365       11212 :                HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
    6366       11212 :                               static_cast<uint8_t>(dfNoDataValue), nWidth,
    6367       11212 :                               nHeight, nLineStride, nComponents);
    6368             :     }
    6369         821 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
    6370             :     {
    6371             :         // Use unsigned implementation by converting the nodatavalue to
    6372             :         // unsigned
    6373         119 :         return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
    6374          59 :                HasOnlyNoDataT(
    6375             :                    static_cast<const uint8_t *>(pBuffer),
    6376          59 :                    static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
    6377          60 :                    nWidth, nHeight, nLineStride, nComponents);
    6378             :     }
    6379         761 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
    6380             :     {
    6381          23 :         return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
    6382          11 :                HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
    6383          11 :                               static_cast<uint16_t>(dfNoDataValue), nWidth,
    6384          12 :                               nHeight, nLineStride, nComponents);
    6385             :     }
    6386         749 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
    6387             :     {
    6388             :         // Use unsigned implementation by converting the nodatavalue to
    6389             :         // unsigned
    6390         111 :         return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
    6391          55 :                HasOnlyNoDataT(
    6392             :                    static_cast<const uint16_t *>(pBuffer),
    6393          55 :                    static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
    6394          56 :                    nWidth, nHeight, nLineStride, nComponents);
    6395             :     }
    6396         693 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
    6397             :     {
    6398         129 :         return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
    6399          64 :                HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
    6400             :                               static_cast<uint32_t>(dfNoDataValue), nWidth,
    6401          65 :                               nHeight, nLineStride, nComponents);
    6402             :     }
    6403         628 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
    6404             :     {
    6405             :         // Use unsigned implementation by converting the nodatavalue to
    6406             :         // unsigned
    6407          23 :         return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
    6408          11 :                HasOnlyNoDataT(
    6409             :                    static_cast<const uint32_t *>(pBuffer),
    6410          11 :                    static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
    6411          12 :                    nWidth, nHeight, nLineStride, nComponents);
    6412             :     }
    6413         616 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
    6414             :     {
    6415         112 :         return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
    6416          56 :                HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
    6417             :                               static_cast<uint64_t>(dfNoDataValue), nWidth,
    6418          56 :                               nHeight, nLineStride, nComponents);
    6419             :     }
    6420         560 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
    6421             :     {
    6422             :         // Use unsigned implementation by converting the nodatavalue to
    6423             :         // unsigned
    6424           0 :         return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
    6425           0 :                HasOnlyNoDataT(
    6426             :                    static_cast<const uint64_t *>(pBuffer),
    6427           0 :                    static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
    6428           0 :                    nWidth, nHeight, nLineStride, nComponents);
    6429             :     }
    6430         560 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
    6431             :     {
    6432         106 :         return (std::isnan(dfNoDataValue) ||
    6433         211 :                 GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
    6434         105 :                HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
    6435             :                               static_cast<GFloat16>(dfNoDataValue), nWidth,
    6436         106 :                               nHeight, nLineStride, nComponents);
    6437             :     }
    6438         454 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    6439             :     {
    6440         268 :         return (std::isnan(dfNoDataValue) ||
    6441         535 :                 GDALIsValueInRange<float>(dfNoDataValue)) &&
    6442         267 :                HasOnlyNoDataT(static_cast<const float *>(pBuffer),
    6443             :                               static_cast<float>(dfNoDataValue), nWidth,
    6444         268 :                               nHeight, nLineStride, nComponents);
    6445             :     }
    6446         186 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    6447             :     {
    6448         186 :         return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
    6449             :                               dfNoDataValue, nWidth, nHeight, nLineStride,
    6450         186 :                               nComponents);
    6451             :     }
    6452           0 :     return false;
    6453             : }
    6454             : 
    6455             : #ifdef HAVE_SSE2
    6456             : 
    6457             : /************************************************************************/
    6458             : /*                       GDALDeinterleave3Byte()                        */
    6459             : /************************************************************************/
    6460             : // De-interleave nIters 3-byte items: pabyDestC[i] = pabySrc[3 * i + C] for C in 0..2.
    6461             : #if defined(__GNUC__) && !defined(__clang__)
    6462             : __attribute__((optimize("no-tree-vectorize"))) // plain GCC: do not auto-vectorize this function
    6463             : #endif
    6464      383026 : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    6465             :                                   GByte *CPL_RESTRICT pabyDest0,
    6466             :                                   GByte *CPL_RESTRICT pabyDest1,
    6467             :                                   GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    6468             : #ifdef USE_NEON_OPTIMIZATIONS // this build flavour forwards unconditionally to the _SSSE3-named helper
    6469             : {
    6470             :     return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6471             :                                        nIters);
    6472             : }
    6473             : #else
    6474             : {
    6475             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME // SSSE3 helper compiled in: use it when CPLHaveRuntimeSSSE3() reports true
    6476      383026 :     if (CPLHaveRuntimeSSSE3())
    6477             :     {
    6478      383024 :         return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    6479      383024 :                                            pabyDest2, nIters);
    6480             :     }
    6481             : #endif
    6482             :     // Portable fallback: word-at-a-time loop when alignment permits, then a scalar loop for the rest.
    6483           2 :     size_t i = 0;
    6484           2 :     if (((reinterpret_cast<uintptr_t>(pabySrc) |
    6485           2 :           reinterpret_cast<uintptr_t>(pabyDest0) |
    6486           2 :           reinterpret_cast<uintptr_t>(pabyDest1) |
    6487           2 :           reinterpret_cast<uintptr_t>(pabyDest2)) %
    6488             :          sizeof(unsigned int)) == 0) // true only if all four pointers are aligned on unsigned int
    6489             :     {
    6490             :         // Slightly better than GCC autovectorizer: 4 items (12 source bytes) per iteration. The shifts and masks below assume little-endian byte order.
    6491          17 :         for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
    6492             :         {
    6493          15 :             unsigned int word0 =
    6494          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i); // source bytes 0..3 of this group
    6495          15 :             unsigned int word1 =
    6496          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4); // source bytes 4..7 of this group
    6497          15 :             unsigned int word2 =
    6498          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8); // source bytes 8..11 of this group
    6499          15 :             reinterpret_cast<unsigned int *>(pabyDest0)[j] = // group bytes 0, 3, 6 and 9
    6500          15 :                 (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
    6501          15 :                 ((word2 >> 8) << 24);
    6502          15 :             reinterpret_cast<unsigned int *>(pabyDest1)[j] = // group bytes 1, 4, 7 and 10
    6503          15 :                 ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
    6504          15 :                 (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
    6505          15 :             pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16); // group bytes 2, 5, 8 and 11, stored one byte at a time
    6506          15 :             pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
    6507          15 :             pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
    6508          15 :             pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
    6509             :         }
    6510             :     }
    6511             : #if defined(__clang__) // the pragma below disables clang vectorization of the scalar loop
    6512             : #pragma clang loop vectorize(disable)
    6513             : #endif
    6514           3 :     for (; i < nIters; ++i) // items left over by the word loop, or every item if a pointer is unaligned
    6515             :     {
    6516           1 :         pabyDest0[i] = pabySrc[3 * i + 0];
    6517           1 :         pabyDest1[i] = pabySrc[3 * i + 1];
    6518           1 :         pabyDest2[i] = pabySrc[3 * i + 2];
    6519             :     }
    6520             : }
    6521             : #endif
    6522             : 
    6523             : /************************************************************************/
    6524             : /*                       GDALDeinterleave4Byte()                        */
    6525             : /************************************************************************/
    6526             : 
    6527             : #if !defined(__GNUC__) || defined(__clang__)
    6528             : 
    6529             : /************************************************************************/
    6530             : /*                            deinterleave()                            */
    6531             : /************************************************************************/
    6532             : // Take one byte from each of the 16 int32 lanes held in four SSE registers and return them packed in one register.
    6533             : template <bool SHIFT, bool MASK>
    6534             : inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
    6535             :                             __m128i &xmm2_ori, __m128i &xmm3_ori)
    6536             : {
    6537             :     // SHIFT: move every int32 lane right by 8 bits. The registers are taken by reference and updated in place, so the shifts accumulate over successive calls.
    6538             :     if (SHIFT)
    6539             :     {
    6540             :         xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
    6541             :         xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
    6542             :         xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
    6543             :         xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
    6544             :     }
    6545             :     __m128i xmm0;
    6546             :     __m128i xmm1;
    6547             :     __m128i xmm2;
    6548             :     __m128i xmm3;
    6549             :     if (MASK) // Set higher 24bit of each int32 packed word to 0 (on local copies only)
    6550             :     {
    6551             :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    6552             :         xmm0 = _mm_and_si128(xmm0_ori, xmm_mask);
    6553             :         xmm1 = _mm_and_si128(xmm1_ori, xmm_mask);
    6554             :         xmm2 = _mm_and_si128(xmm2_ori, xmm_mask);
    6555             :         xmm3 = _mm_and_si128(xmm3_ori, xmm_mask);
    6556             :     }
    6557             :     else
    6558             :     {
    6559             :         xmm0 = xmm0_ori;
    6560             :         xmm1 = xmm1_ori;
    6561             :         xmm2 = xmm2_ori;
    6562             :         xmm3 = xmm3_ori;
    6563             :     }
    6564             :     // Pack int32 to int16 (saturating pack: lanes are expected to be within [0, 255] at this point)
    6565             :     xmm0 = _mm_packs_epi32(xmm0, xmm1);
    6566             :     xmm2 = _mm_packs_epi32(xmm2, xmm3);
    6567             :     // Pack int16 to uint8
    6568             :     xmm0 = _mm_packus_epi16(xmm0, xmm2);
    6569             :     return xmm0;
    6570             : }
    6571             : // De-interleave nIters 4-byte items: pabyDestC[i] = pabySrc[4 * i + C] for C in 0..3 (variant for compilers other than plain GCC).
    6572             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    6573             :                                   GByte *CPL_RESTRICT pabyDest0,
    6574             :                                   GByte *CPL_RESTRICT pabyDest1,
    6575             :                                   GByte *CPL_RESTRICT pabyDest2,
    6576             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6577             : #ifdef USE_NEON_OPTIMIZATIONS // this build flavour forwards unconditionally to the _SSSE3-named helper
    6578             : {
    6579             :     return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6580             :                                        pabyDest3, nIters);
    6581             : }
    6582             : #else
    6583             : {
    6584             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME // SSSE3 helper compiled in: use it when CPLHaveRuntimeSSSE3() reports true
    6585             :     if (CPLHaveRuntimeSSSE3())
    6586             :     {
    6587             :         return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    6588             :                                            pabyDest2, pabyDest3, nIters);
    6589             :     }
    6590             : #endif
    6591             : 
    6592             :     // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
    6593             :     // do something slightly better.
    6594             :     size_t i = 0;
    6595             :     for (; i + 15 < nIters; i += 16) // 16 items (64 source bytes) per iteration, using unaligned loads and stores
    6596             :     {
    6597             :         __m128i xmm0_ori = _mm_loadu_si128(
    6598             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
    6599             :         __m128i xmm1_ori = _mm_loadu_si128(
    6600             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
    6601             :         __m128i xmm2_ori = _mm_loadu_si128(
    6602             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
    6603             :         __m128i xmm3_ori = _mm_loadu_si128(
    6604             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
    6605             :         // The four calls below must stay in this order: each SHIFT=true call shifts the registers in place by 8 more bits; after the third shift the upper bits are zero, so the last call needs no mask.
    6606             :         _mm_storeu_si128(
    6607             :             reinterpret_cast<__m128i *>(pabyDest0 + i),
    6608             :             deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6609             :         _mm_storeu_si128(
    6610             :             reinterpret_cast<__m128i *>(pabyDest1 + i),
    6611             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6612             :         _mm_storeu_si128(
    6613             :             reinterpret_cast<__m128i *>(pabyDest2 + i),
    6614             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6615             :         _mm_storeu_si128(
    6616             :             reinterpret_cast<__m128i *>(pabyDest3 + i),
    6617             :             deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    6618             :     }
    6619             : 
    6620             : #if defined(__clang__) // the pragma below disables clang vectorization of the scalar loop
    6621             : #pragma clang loop vectorize(disable)
    6622             : #endif
    6623             :     for (; i < nIters; ++i) // scalar loop for the items left over by the 16-item loop
    6624             :     {
    6625             :         pabyDest0[i] = pabySrc[4 * i + 0];
    6626             :         pabyDest1[i] = pabySrc[4 * i + 1];
    6627             :         pabyDest2[i] = pabySrc[4 * i + 2];
    6628             :         pabyDest3[i] = pabySrc[4 * i + 3];
    6629             :     }
    6630             : }
    6631             : #endif
    6632             : #else
    6633             : // Plain-GCC variant: a simple loop compiled with tree-vectorize forced on, as the GCC autovectorizer does an excellent job
    6634       97789 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
    6635             :     const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    6636             :     GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
    6637             :     GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6638             : {
    6639   545374000 :     for (size_t i = 0; i < nIters; ++i) // item i, component C is read from pabySrc[4 * i + C]
    6640             :     {
    6641   545276000 :         pabyDest0[i] = pabySrc[4 * i + 0];
    6642   545276000 :         pabyDest1[i] = pabySrc[4 * i + 1];
    6643   545276000 :         pabyDest2[i] = pabySrc[4 * i + 2];
    6644   545276000 :         pabyDest3[i] = pabySrc[4 * i + 3];
    6645             :     }
    6646       97789 : }
    6647             : #endif
    6648             : 
    6649             : #else
    6650             : 
    6651             : /************************************************************************/
    6652             : /*                       GDALDeinterleave3Byte()                        */
    6653             : /************************************************************************/
    6654             : // Portable scalar version (selected by the enclosing #if/#else): pabyDestC[i] = pabySrc[3 * i + C] for C in 0..2.
    6655             : // TODO: Enabling below could help on non-Intel architectures where GCC knows
    6656             : // how to auto-vectorize
    6657             : // #if defined(__GNUC__)
    6658             : //__attribute__((optimize("tree-vectorize")))
    6659             : // #endif
    6660             : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    6661             :                                   GByte *CPL_RESTRICT pabyDest0,
    6662             :                                   GByte *CPL_RESTRICT pabyDest1,
    6663             :                                   GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    6664             : {
    6665             :     for (size_t i = 0; i < nIters; ++i) // one 3-byte item per iteration
    6666             :     {
    6667             :         pabyDest0[i] = pabySrc[3 * i + 0];
    6668             :         pabyDest1[i] = pabySrc[3 * i + 1];
    6669             :         pabyDest2[i] = pabySrc[3 * i + 2];
    6670             :     }
    6671             : }
    6672             : 
    6673             : /************************************************************************/
    6674             : /*                       GDALDeinterleave4Byte()                        */
    6675             : /************************************************************************/
    6676             : // Portable scalar version (selected by the enclosing #if/#else): pabyDestC[i] = pabySrc[4 * i + C] for C in 0..3.
    6677             : // TODO: Enabling below could help on non-Intel architectures where gcc knows
    6678             : // how to auto-vectorize
    6679             : // #if defined(__GNUC__)
    6680             : //__attribute__((optimize("tree-vectorize")))
    6681             : // #endif
    6682             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    6683             :                                   GByte *CPL_RESTRICT pabyDest0,
    6684             :                                   GByte *CPL_RESTRICT pabyDest1,
    6685             :                                   GByte *CPL_RESTRICT pabyDest2,
    6686             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    6687             : {
    6688             :     for (size_t i = 0; i < nIters; ++i) // one 4-byte item per iteration
    6689             :     {
    6690             :         pabyDest0[i] = pabySrc[4 * i + 0];
    6691             :         pabyDest1[i] = pabySrc[4 * i + 1];
    6692             :         pabyDest2[i] = pabySrc[4 * i + 2];
    6693             :         pabyDest3[i] = pabySrc[4 * i + 3];
    6694             :     }
    6695             : }
    6696             : 
    6697             : #endif
    6698             : 
    6699             : /************************************************************************/
    6700             : /*                          GDALDeinterleave()                          */
    6701             : /************************************************************************/
    6702             : 
    6703             : /*! Copy values from a pixel-interleaved buffer to multiple per-component
    6704             :     buffers.
    6705             : 
    6706             :     In pseudo-code
    6707             :     \verbatim
    6708             :     for(size_t i = 0; i < nIters; ++i)
    6709             :         for(int iComp = 0; iComp < nComponents; iComp++ )
    6710             :             ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
    6711             :     \endverbatim
    6712             : 
    6713             :     The implementation is optimized for a few cases, like de-interleaving
    6714             :     of 3 or 4-component Byte buffers.
    6715             :     All other cases are handled by one GDALCopyWords64() call per component.
    6716             :     \since GDAL 3.6
    6717             :  */
    6718      481165 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
    6719             :                       int nComponents, void **ppDestBuffer,
    6720             :                       GDALDataType eDestDT, size_t nIters)
    6721             : {
    6722      481165 :     if (eSourceDT == eDestDT) // the specialized paths require identical source and destination types
    6723             :     {
    6724      481143 :         if (eSourceDT == GDT_UInt8 || eSourceDT == GDT_Int8) // one-byte samples
    6725             :         {
    6726      480822 :             if (nComponents == 3)
    6727             :             {
    6728      383026 :                 const GByte *CPL_RESTRICT pabySrc =
    6729             :                     static_cast<const GByte *>(pSourceBuffer);
    6730      383026 :                 GByte *CPL_RESTRICT pabyDest0 =
    6731             :                     static_cast<GByte *>(ppDestBuffer[0]);
    6732      383026 :                 GByte *CPL_RESTRICT pabyDest1 =
    6733             :                     static_cast<GByte *>(ppDestBuffer[1]);
    6734      383026 :                 GByte *CPL_RESTRICT pabyDest2 =
    6735             :                     static_cast<GByte *>(ppDestBuffer[2]);
    6736      383026 :                 GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6737             :                                       nIters);
    6738      383026 :                 return;
    6739             :             }
    6740       97796 :             else if (nComponents == 4)
    6741             :             {
    6742       97789 :                 const GByte *CPL_RESTRICT pabySrc =
    6743             :                     static_cast<const GByte *>(pSourceBuffer);
    6744       97789 :                 GByte *CPL_RESTRICT pabyDest0 =
    6745             :                     static_cast<GByte *>(ppDestBuffer[0]);
    6746       97789 :                 GByte *CPL_RESTRICT pabyDest1 =
    6747             :                     static_cast<GByte *>(ppDestBuffer[1]);
    6748       97789 :                 GByte *CPL_RESTRICT pabyDest2 =
    6749             :                     static_cast<GByte *>(ppDestBuffer[2]);
    6750       97789 :                 GByte *CPL_RESTRICT pabyDest3 =
    6751             :                     static_cast<GByte *>(ppDestBuffer[3]);
    6752       97789 :                 GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    6753             :                                       pabyDest3, nIters);
    6754       97789 :                 return;
    6755           7 :             }
    6756             :         }
    6757             : #if ((defined(__GNUC__) && !defined(__clang__)) ||                             \
    6758             :      defined(__INTEL_CLANG_COMPILER)) &&                                       \
    6759             :     defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME) // 16-bit SSSE3 paths: plain GCC or Intel clang-based compiler only
    6760         642 :         else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
    6761         321 :                  CPLHaveRuntimeSSSE3())
    6762             :         {
    6763         321 :             if (nComponents == 3)
    6764             :             {
    6765         126 :                 const GUInt16 *CPL_RESTRICT panSrc =
    6766             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    6767         126 :                 GUInt16 *CPL_RESTRICT panDest0 =
    6768             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    6769         126 :                 GUInt16 *CPL_RESTRICT panDest1 =
    6770             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    6771         126 :                 GUInt16 *CPL_RESTRICT panDest2 =
    6772             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    6773         126 :                 GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
    6774             :                                               panDest2, nIters);
    6775         126 :                 return;
    6776             :             }
    6777             : #if !defined(__INTEL_CLANG_COMPILER)
    6778             :             // ICC autovectorizer doesn't do a good job, at least with icx
    6779             :             // 2022.1.0.20220316
    6780         195 :             else if (nComponents == 4)
    6781             :             {
    6782         195 :                 const GUInt16 *CPL_RESTRICT panSrc =
    6783             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    6784         195 :                 GUInt16 *CPL_RESTRICT panDest0 =
    6785             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    6786         195 :                 GUInt16 *CPL_RESTRICT panDest1 =
    6787             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    6788         195 :                 GUInt16 *CPL_RESTRICT panDest2 =
    6789             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    6790         195 :                 GUInt16 *CPL_RESTRICT panDest3 =
    6791             :                     static_cast<GUInt16 *>(ppDestBuffer[3]);
    6792         195 :                 GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
    6793             :                                               panDest2, panDest3, nIters);
    6794         195 :                 return;
    6795             :             }
    6796             : #endif
    6797             :         }
    6798             : #endif
    6799             :     }
    6800             :     // Generic path (types differ, or no specialized case returned above): one GDALCopyWords64() call per component, with a source stride of nComponents * nSourceDTSize bytes.
    6801          29 :     const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
    6802          29 :     const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
    6803         108 :     for (int iComp = 0; iComp < nComponents; iComp++)
    6804             :     {
    6805          79 :         GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
    6806          79 :                             iComp * nSourceDTSize,
    6807             :                         eSourceDT, nComponents * nSourceDTSize,
    6808          79 :                         ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
    6809             :     }
    6810             : }
    6811             : 
    6812             : /************************************************************************/
    6813             : /*                   GDALTranspose2DSingleToSingle()                    */
    6814             : /************************************************************************/
    6815             : /**
    6816             :  * Transpose a 2D array of non-complex values, in an efficient (cache-friendly) way.
    6817             :  * The array is walked in 32x32 tiles; each value is passed through GDALCopyWord() (DST and SRC are independent template types).
    6818             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6819             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6820             :  * @param nSrcWidth Width of pSrc array.
    6821             :  * @param nSrcHeight Height of pSrc array.
    6822             :  */
    6823             : 
    6824             : template <class DST, class SRC>
    6825         160 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
    6826             :                                    DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6827             :                                    size_t nSrcHeight)
    6828             : {
    6829         160 :     constexpr size_t blocksize = 32; // tile edge length, in elements
    6830         345 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6831             :     {
    6832         185 :         const size_t max_k = std::min(i + blocksize, nSrcHeight); // end row of the tile (exclusive), clipped to the array
    6833        5016 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6834             :         {
    6835             :             // transpose the block beginning at [i,j]
    6836        4831 :             const size_t max_l = std::min(j + blocksize, nSrcWidth); // end column of the tile (exclusive), clipped to the array
    6837       26185 :             for (size_t k = i; k < max_k; ++k)
    6838             :             {
    6839      669282 :                 for (size_t l = j; l < max_l; ++l)
    6840             :                 {
    6841      647928 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],
    6842      647928 :                                  pDst[k + l * nSrcHeight]); // source (row k, column l) goes to destination (row l, column k)
    6843             :                 }
    6844             :             }
    6845             :         }
    6846             :     }
    6847         160 : }
    6848             : 
    6849             : /************************************************************************/
    6850             : /*                  GDALTranspose2DComplexToComplex()                   */
    6851             : /************************************************************************/
    6852             : /**
    6853             :  * Transpose a 2D array of complex values into an array of complex values,
    6854             :  * in an efficient (cache-friendly, 32x32 tiles) way. Both values of each element are copied with GDALCopyWord().
    6855             :  *
    6856             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth (two consecutive values per element).
    6857             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight (two consecutive values per element).
    6858             :  * @param nSrcWidth Width of pSrc array.
    6859             :  * @param nSrcHeight Height of pSrc array.
    6860             :  */
    6861             : template <class DST, class SRC>
    6862          25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
    6863             :                                      DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6864             :                                      size_t nSrcHeight)
    6865             : {
    6866          25 :     constexpr size_t blocksize = 32; // tile edge length, in elements
    6867          50 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6868             :     {
    6869          25 :         const size_t max_k = std::min(i + blocksize, nSrcHeight); // end row of the tile (exclusive), clipped to the array
    6870          50 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6871             :         {
    6872             :             // transpose the block beginning at [i,j]
    6873          25 :             const size_t max_l = std::min(j + blocksize, nSrcWidth); // end column of the tile (exclusive), clipped to the array
    6874          75 :             for (size_t k = i; k < max_k; ++k)
    6875             :             {
    6876         200 :                 for (size_t l = j; l < max_l; ++l)
    6877             :                 {
    6878         150 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
    6879         150 :                                  pDst[2 * (k + l * nSrcHeight) + 0]); // first value of the pair
    6880         150 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
    6881         150 :                                  pDst[2 * (k + l * nSrcHeight) + 1]); // second value of the pair
    6882             :                 }
    6883             :             }
    6884             :         }
    6885             :     }
    6886          25 : }
    6887             : 
    6888             : /************************************************************************/
    6889             : /*                   GDALTranspose2DComplexToSingle()                   */
    6890             : /************************************************************************/
    6891             : /**
    6892             :  * Transpose a 2D array of complex values into an array of non-complex values,
    6893             :  * in an efficient (cache-friendly, 32x32 tiles) way. Only the first value of each source pair is copied.
    6894             :  *
    6895             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth (two consecutive values per element).
    6896             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    6897             :  * @param nSrcWidth Width of pSrc array.
    6898             :  * @param nSrcHeight Height of pSrc array.
    6899             :  */
    6900             : template <class DST, class SRC>
    6901          55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
    6902             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6903             :                                     size_t nSrcHeight)
    6904             : {
    6905          55 :     constexpr size_t blocksize = 32; // tile edge length, in elements
    6906         110 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6907             :     {
    6908          55 :         const size_t max_k = std::min(i + blocksize, nSrcHeight); // end row of the tile (exclusive), clipped to the array
    6909         110 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6910             :         {
    6911             :             // transpose the block beginning at [i,j]
    6912          55 :             const size_t max_l = std::min(j + blocksize, nSrcWidth); // end column of the tile (exclusive), clipped to the array
    6913         165 :             for (size_t k = i; k < max_k; ++k)
    6914             :             {
    6915         440 :                 for (size_t l = j; l < max_l; ++l)
    6916             :                 {
    6917         330 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0], // first value of the pair (presumably the real part); the second one is ignored
    6918         330 :                                  pDst[k + l * nSrcHeight]);
    6919             :                 }
    6920             :             }
    6921             :         }
    6922             :     }
    6923          55 : }
    6924             : 
    6925             : /************************************************************************/
    6926             : /*                   GDALTranspose2DSingleToComplex()                   */
    6927             : /************************************************************************/
    6928             : /**
    6929             :  * Transpose a 2D array of non-complex values into an array of complex values,
    6930             :  * in an efficient (cache-friendly, 32x32 tiles) way. The second value of each destination pair is set to 0.
    6931             :  *
    6932             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    6933             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight (two consecutive values per element).
    6934             :  * @param nSrcWidth Width of pSrc array.
    6935             :  * @param nSrcHeight Height of pSrc array.
    6936             :  */
    6937             : template <class DST, class SRC>
    6938          55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
    6939             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    6940             :                                     size_t nSrcHeight)
    6941             : {
    6942          55 :     constexpr size_t blocksize = 32; // tile edge length, in elements
    6943         110 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    6944             :     {
    6945          55 :         const size_t max_k = std::min(i + blocksize, nSrcHeight); // end row of the tile (exclusive), clipped to the array
    6946         110 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    6947             :         {
    6948             :             // transpose the block beginning at [i,j]
    6949          55 :             const size_t max_l = std::min(j + blocksize, nSrcWidth); // end column of the tile (exclusive), clipped to the array
    6950         165 :             for (size_t k = i; k < max_k; ++k)
    6951             :             {
    6952         440 :                 for (size_t l = j; l < max_l; ++l)
    6953             :                 {
    6954         330 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],
    6955         330 :                                  pDst[2 * (k + l * nSrcHeight) + 0]); // source value becomes the first value of the pair
    6956         330 :                     pDst[2 * (k + l * nSrcHeight) + 1] = 0; // second value of the pair is zeroed
    6957             :                 }
    6958             :             }
    6959             :         }
    6960             :     }
    6961          55 : }
    6962             : 
    6963             : /************************************************************************/
    6964             : /*                          GDALTranspose2D()                           */
    6965             : /************************************************************************/
    6966             : 
    6967             : template <class DST, bool DST_IS_COMPLEX>
    6968         295 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
    6969             :                             size_t nSrcWidth, size_t nSrcHeight)
    6970             : {
    6971             : #define CALL_GDALTranspose2D_internal(SRC_TYPE)                                \
    6972             :     do                                                                         \
    6973             :     {                                                                          \
    6974             :         if constexpr (DST_IS_COMPLEX)                                          \
    6975             :         {                                                                      \
    6976             :             GDALTranspose2DSingleToComplex(                                    \
    6977             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6978             :                 nSrcHeight);                                                   \
    6979             :         }                                                                      \
    6980             :         else                                                                   \
    6981             :         {                                                                      \
    6982             :             GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
    6983             :                                           pDst, nSrcWidth, nSrcHeight);        \
    6984             :         }                                                                      \
    6985             :     } while (0)
    6986             : 
    6987             : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE)                         \
    6988             :     do                                                                         \
    6989             :     {                                                                          \
    6990             :         if constexpr (DST_IS_COMPLEX)                                          \
    6991             :         {                                                                      \
    6992             :             GDALTranspose2DComplexToComplex(                                   \
    6993             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    6994             :                 nSrcHeight);                                                   \
    6995             :         }                                                                      \
    6996             :         else                                                                   \
    6997             :         {                                                                      \
    6998             :             GDALTranspose2DComplexToSingle(                                    \
    6999             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    7000             :                 nSrcHeight);                                                   \
    7001             :         }                                                                      \
    7002             :     } while (0)
    7003             : 
    7004             :     // clang-format off
    7005         295 :     switch (eSrcType)
    7006             :     {
    7007          16 :         case GDT_UInt8:     CALL_GDALTranspose2D_internal(uint8_t); break;
    7008          15 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t); break;
    7009          33 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t); break;
    7010          20 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t); break;
    7011          24 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t); break;
    7012          16 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t); break;
    7013          16 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t); break;
    7014          16 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t); break;
    7015          16 :         case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16); break;
    7016          19 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float); break;
    7017          24 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double); break;
    7018          16 :         case GDT_CInt16:   CALL_GDALTranspose2DComplex_internal(int16_t); break;
    7019          16 :         case GDT_CInt32:   CALL_GDALTranspose2DComplex_internal(int32_t); break;
    7020          16 :         case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
    7021          16 :         case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
    7022          16 :         case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
    7023           0 :         case GDT_Unknown:
    7024             :         case GDT_TypeCount:
    7025           0 :             break;
    7026             :     }
    7027             :         // clang-format on
    7028             : 
    7029             : #undef CALL_GDALTranspose2D_internal
    7030             : #undef CALL_GDALTranspose2DComplex_internal
    7031         295 : }
    7032             : 
    7033             : /************************************************************************/
    7034             : /*                        GDALInterleave2Byte()                         */
    7035             : /************************************************************************/
    7036             : 
    7037             : #if defined(HAVE_SSE2) &&                                                      \
    7038             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    7039             : 
    7040             : // ICC autovectorizer doesn't do a good job at generating good SSE code,
    7041             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
    7042             : #if defined(__GNUC__)
    7043             : __attribute__((noinline))
    7044             : #endif
    7045             : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
    7046             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)
    7047             : {
    7048             :     size_t i = 0;
    7049             :     constexpr size_t VALS_PER_ITER = 16;
    7050             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
    7051             :     {
    7052             :         __m128i xmm0 =
    7053             :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
    7054             :         __m128i xmm1 = _mm_loadu_si128(
    7055             :             reinterpret_cast<__m128i const *>(pSrc + i + nIters));
    7056             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
    7057             :                          _mm_unpacklo_epi8(xmm0, xmm1));
    7058             :         _mm_storeu_si128(
    7059             :             reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
    7060             :             _mm_unpackhi_epi8(xmm0, xmm1));
    7061             :     }
    7062             : #if defined(__clang__)
    7063             : #pragma clang loop vectorize(disable)
    7064             : #endif
    7065             :     for (; i < nIters; ++i)
    7066             :     {
    7067             :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];
    7068             :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];
    7069             :     }
    7070             : }
    7071             : 
    7072             : #else
    7073             : 
    7074             : #if defined(__GNUC__) && !defined(__clang__)
    7075             : __attribute__((optimize("tree-vectorize")))
    7076             : #endif
    7077             : #if defined(__GNUC__)
    7078             : __attribute__((noinline))
    7079             : #endif
    7080             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7081             : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
    7082             : #pragma clang diagnostic push
    7083             : #pragma clang diagnostic ignored "-Wpass-failed"
    7084             : #endif
    7085           9 : static void GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
    7086             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)
    7087             : {
    7088             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7089             : #pragma clang loop vectorize(enable)
    7090             : #endif
    7091      355429 :     for (size_t i = 0; i < nIters; ++i)
    7092             :     {
    7093      355420 :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];
    7094      355420 :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];
    7095             :     }
    7096           9 : }
    7097             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7098             : #pragma clang diagnostic pop
    7099             : #endif
    7100             : 
    7101             : #endif
    7102             : 
    7103             : /************************************************************************/
    7104             : /*                        GDALInterleave4Byte()                         */
    7105             : /************************************************************************/
    7106             : 
    7107             : #if defined(HAVE_SSE2) &&                                                      \
    7108             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    7109             : 
    7110             : // ICC autovectorizer doesn't do a good job at generating good SSE code,
    7111             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
    7112             : #if defined(__GNUC__)
    7113             : __attribute__((noinline))
    7114             : #endif
    7115             : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
    7116             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)
    7117             : {
    7118             :     size_t i = 0;
    7119             :     constexpr size_t VALS_PER_ITER = 16;
    7120             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
    7121             :     {
    7122             :         __m128i xmm0 = _mm_loadu_si128(
    7123             :             reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
    7124             :         __m128i xmm1 = _mm_loadu_si128(
    7125             :             reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
    7126             :         __m128i xmm2 = _mm_loadu_si128(
    7127             :             reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
    7128             :         __m128i xmm3 = _mm_loadu_si128(
    7129             :             reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
    7130             :         auto tmp0 = _mm_unpacklo_epi8(
    7131             :             xmm0,
    7132             :             xmm1);  // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
    7133             :         auto tmp1 = _mm_unpackhi_epi8(
    7134             :             xmm0,
    7135             :             xmm1);  // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
    7136             :         auto tmp2 = _mm_unpacklo_epi8(
    7137             :             xmm2,
    7138             :             xmm3);  // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
    7139             :         auto tmp3 = _mm_unpackhi_epi8(
    7140             :             xmm2,
    7141             :             xmm3);  // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
    7142             :         auto tmp2_0 = _mm_unpacklo_epi16(
    7143             :             tmp0,
    7144             :             tmp2);  // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
    7145             :         auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
    7146             :         auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
    7147             :         auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
    7148             :         _mm_storeu_si128(
    7149             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
    7150             :             tmp2_0);
    7151             :         _mm_storeu_si128(
    7152             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
    7153             :             tmp2_1);
    7154             :         _mm_storeu_si128(
    7155             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
    7156             :             tmp2_2);
    7157             :         _mm_storeu_si128(
    7158             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
    7159             :             tmp2_3);
    7160             :     }
    7161             : #if defined(__clang__)
    7162             : #pragma clang loop vectorize(disable)
    7163             : #endif
    7164             :     for (; i < nIters; ++i)
    7165             :     {
    7166             :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];
    7167             :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];
    7168             :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];
    7169             :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];
    7170             :     }
    7171             : }
    7172             : 
    7173             : #else
    7174             : 
    7175             : #if defined(__GNUC__) && !defined(__clang__)
    7176             : __attribute__((optimize("tree-vectorize")))
    7177             : #endif
    7178             : #if defined(__GNUC__)
    7179             : __attribute__((noinline))
    7180             : #endif
    7181             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7182             : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
    7183             : #pragma clang diagnostic push
    7184             : #pragma clang diagnostic ignored "-Wpass-failed"
    7185             : #endif
    7186          30 : static void GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
    7187             :                                 uint8_t *CPL_RESTRICT pDst, size_t nIters)
    7188             : {
    7189             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7190             : #pragma clang loop vectorize(enable)
    7191             : #endif
    7192    49620700 :     for (size_t i = 0; i < nIters; ++i)
    7193             :     {
    7194    49620600 :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];
    7195    49620600 :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];
    7196    49620600 :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];
    7197    49620600 :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];
    7198             :     }
    7199          30 : }
    7200             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    7201             : #pragma clang diagnostic pop
    7202             : #endif
    7203             : 
    7204             : #endif
    7205             : 
    7206             : /************************************************************************/
    7207             : /*                          GDALTranspose2D()                           */
    7208             : /************************************************************************/
    7209             : 
    7210             : /**
    7211             :  * Transpose a 2D array in a efficient (cache-oblivious) way.
    7212             :  *
    7213             :  * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
    7214             :  * @param eSrcType Data type of pSrc.
    7215             :  * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
    7216             :  * @param eDstType Data type of pDst.
    7217             :  * @param nSrcWidth Width of pSrc array.
    7218             :  * @param nSrcHeight Height of pSrc array.
    7219             :  * @since GDAL 3.11
    7220             :  */
    7221             : 
    7222         365 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
    7223             :                      GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
    7224             : {
    7225         365 :     if (eSrcType == eDstType && (eSrcType == GDT_UInt8 || eSrcType == GDT_Int8))
    7226             :     {
    7227          70 :         if (nSrcHeight == 2)
    7228             :         {
    7229           9 :             GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
    7230             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    7231           9 :             return;
    7232             :         }
    7233          61 :         if (nSrcHeight == 4)
    7234             :         {
    7235          30 :             GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
    7236             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    7237          30 :             return;
    7238             :         }
    7239             : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                    \
    7240             :      (defined(__x86_64) || defined(_M_X64)))
    7241          31 :         if (CPLHaveRuntimeSSSE3())
    7242             :         {
    7243          31 :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    7244             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    7245             :                                        nSrcHeight);
    7246          31 :             return;
    7247             :         }
    7248             : #elif defined(USE_NEON_OPTIMIZATIONS)
    7249             :         {
    7250             :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    7251             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    7252             :                                        nSrcHeight);
    7253             :             return;
    7254             :         }
    7255             : #endif
    7256             :     }
    7257             : 
    7258             : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX)                \
    7259             :     GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>(                                 \
    7260             :         pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
    7261             : 
    7262             :     // clang-format off
    7263         295 :     switch (eDstType)
    7264             :     {
    7265          15 :         case GDT_UInt8:     CALL_GDALTranspose2D_internal(uint8_t, false); break;
    7266          15 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t, false); break;
    7267          33 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t, false); break;
    7268          20 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t, false); break;
    7269          24 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t, false); break;
    7270          16 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t, false); break;
    7271          16 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t, false); break;
    7272          16 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t, false); break;
    7273          16 :         case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16, false); break;
    7274          19 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float, false); break;
    7275          25 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double, false); break;
    7276          16 :         case GDT_CInt16:   CALL_GDALTranspose2D_internal(int16_t, true); break;
    7277          16 :         case GDT_CInt32:   CALL_GDALTranspose2D_internal(int32_t, true); break;
    7278          16 :         case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
    7279          16 :         case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
    7280          16 :         case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
    7281           0 :         case GDT_Unknown:
    7282             :         case GDT_TypeCount:
    7283           0 :             break;
    7284             :     }
    7285             :         // clang-format on
    7286             : 
    7287             : #undef CALL_GDALTranspose2D_internal
    7288             : }
    7289             : 
    7290             : /************************************************************************/
    7291             : /*                     ExtractBitAndConvertTo255()                      */
    7292             : /************************************************************************/
    7293             : 
    7294             : #if defined(__GNUC__) || defined(_MSC_VER)
    7295             : // Signedness of char implementation dependent, so be explicit.
    7296             : // Assumes 2-complement integer types and sign extension of right shifting
    7297             : // GCC guarantees such:
    7298             : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
    7299      143686 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
    7300             : {
    7301      143686 :     return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
    7302      143686 :                               7);
    7303             : }
    7304             : #else
    7305             : // Portable way
    7306             : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
    7307             : {
    7308             :     return (byVal & (1 << nBit)) ? 255 : 0;
    7309             : }
    7310             : #endif
    7311             : 
    7312             : /************************************************************************/
    7313             : /*                  ExpandEightPackedBitsToByteAt255()                  */
    7314             : /************************************************************************/
    7315             : 
    7316       17825 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
    7317             :                                                     GByte abyOutput[8])
    7318             : {
    7319       17825 :     abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);
    7320       17825 :     abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
    7321       17825 :     abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
    7322       17825 :     abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
    7323       17825 :     abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
    7324       17825 :     abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
    7325       17825 :     abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
    7326       17825 :     abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);
    7327       17825 : }
    7328             : 
    7329             : /************************************************************************/
    7330             : /*                 GDALExpandPackedBitsToByteAt0Or255()                 */
    7331             : /************************************************************************/
    7332             : 
    7333             : /** Expand packed-bits (ordered from most-significant bit to least one)
    7334             :   into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
    7335             :   at 1 to a byte at 255.
    7336             : 
    7337             :  The function does (in a possibly more optimized way) the following:
    7338             :  \code{.cpp}
    7339             :  for (size_t i = 0; i < nInputBits; ++i )
    7340             :  {
    7341             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
    7342             :  }
    7343             :  \endcode
    7344             : 
    7345             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    7346             :  @param pabyOutput Output array of nInputBits bytes.
    7347             :  @param nInputBits Number of valid bits in pabyInput.
    7348             : 
    7349             :  @since 3.11
    7350             : */
    7351             : 
    7352       46905 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
    7353             :                                         GByte *CPL_RESTRICT pabyOutput,
    7354             :                                         size_t nInputBits)
    7355             : {
    7356       46905 :     const size_t nInputWholeBytes = nInputBits / 8;
    7357       46905 :     size_t iByte = 0;
    7358             : 
    7359             : #ifdef HAVE_SSE2
    7360             :     // Mask to isolate each bit
    7361       46905 :     const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
    7362             :                                           8, 16, 32, 64, -128);
    7363       46905 :     const __m128i zero = _mm_setzero_si128();
    7364       46905 :     const __m128i all_ones = _mm_set1_epi8(-1);
    7365             : #ifdef __SSSE3__
    7366             :     const __m128i dispatch_two_bytes =
    7367             :         _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
    7368             : #endif
    7369       46905 :     constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
    7370      138950 :     for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
    7371             :     {
    7372       92045 :         __m128i reg_ori = _mm_loadu_si128(
    7373       92045 :             reinterpret_cast<const __m128i *>(pabyInput + iByte));
    7374             : 
    7375       92045 :         constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
    7376      828405 :         for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
    7377             :         {
    7378             :             // Given reg_ori = (A, B, ... 14 other bytes ...),
    7379             :             // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
    7380             : #ifdef __SSSE3__
    7381             :             __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
    7382             : #else
    7383      736360 :             __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
    7384      736360 :             reg = _mm_unpacklo_epi16(reg, reg);
    7385      736360 :             reg = _mm_unpacklo_epi32(reg, reg);
    7386             : #endif
    7387             : 
    7388             :             // Test if bits of interest are set
    7389      736360 :             reg = _mm_and_si128(reg, bit_mask);
    7390             : 
    7391             :             // Now test if those bits are set, by comparing to zero. So the
    7392             :             // result will be that bytes where bits are set will be at 0, and
    7393             :             // ones where they are cleared will be at 0xFF. So the inverse of
    7394             :             // the end result we want!
    7395      736360 :             reg = _mm_cmpeq_epi8(reg, zero);
    7396             : 
    7397             :             // Invert the result
    7398      736360 :             reg = _mm_andnot_si128(reg, all_ones);
    7399             : 
    7400             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
    7401             : 
    7402      736360 :             pabyOutput += SSE_REG_SIZE;
    7403             : 
    7404             :             // Right-shift of 2 bytes
    7405      736360 :             reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
    7406             :         }
    7407             :     }
    7408             : 
    7409             : #endif  // HAVE_SSE2
    7410             : 
    7411       64730 :     for (; iByte < nInputWholeBytes; ++iByte)
    7412             :     {
    7413       17825 :         ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
    7414       17825 :         pabyOutput += 8;
    7415             :     }
    7416       47991 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
    7417             :     {
    7418        1086 :         *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
    7419        1086 :         ++pabyOutput;
    7420             :     }
    7421       46905 : }
    7422             : 
    7423             : /************************************************************************/
    7424             : /*                   ExpandEightPackedBitsToByteAt1()                   */
    7425             : /************************************************************************/
    7426             : 
    7427      136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
    7428             :                                                   GByte abyOutput[8])
    7429             : {
    7430      136113 :     abyOutput[0] = (byVal >> 7) & 0x1;
    7431      136113 :     abyOutput[1] = (byVal >> 6) & 0x1;
    7432      136113 :     abyOutput[2] = (byVal >> 5) & 0x1;
    7433      136113 :     abyOutput[3] = (byVal >> 4) & 0x1;
    7434      136113 :     abyOutput[4] = (byVal >> 3) & 0x1;
    7435      136113 :     abyOutput[5] = (byVal >> 2) & 0x1;
    7436      136113 :     abyOutput[6] = (byVal >> 1) & 0x1;
    7437      136113 :     abyOutput[7] = (byVal >> 0) & 0x1;
    7438      136113 : }
    7439             : 
    7440             : /************************************************************************/
    7441             : /*                  GDALExpandPackedBitsToByteAt0Or1()                  */
    7442             : /************************************************************************/
    7443             : 
    7444             : /** Expand packed-bits (ordered from most-significant bit to least one)
    7445             :   into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
    7446             :   at 1 to a byte at 1.
    7447             : 
    7448             :  The function does (in a possibly more optimized way) the following:
    7449             :  \code{.cpp}
    7450             :  for (size_t i = 0; i < nInputBits; ++i )
    7451             :  {
    7452             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
    7453             :  }
    7454             :  \endcode
    7455             : 
    7456             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    7457             :  @param pabyOutput Output array of nInputBits bytes.
    7458             :  @param nInputBits Number of valid bits in pabyInput.
    7459             : 
    7460             :  @since 3.11
    7461             : */
    7462             : 
    7463        7033 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
    7464             :                                       GByte *CPL_RESTRICT pabyOutput,
    7465             :                                       size_t nInputBits)
    7466             : {
    7467        7033 :     const size_t nInputWholeBytes = nInputBits / 8;
    7468        7033 :     size_t iByte = 0;
    7469      143146 :     for (; iByte < nInputWholeBytes; ++iByte)
    7470             :     {
    7471      136113 :         ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
    7472      136113 :         pabyOutput += 8;
    7473             :     }
    7474       18886 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
    7475             :     {
    7476       11853 :         *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
    7477       11853 :         ++pabyOutput;
    7478             :     }
    7479        7033 : }

Generated by: LCOV version 1.14