LCOV - code coverage report
Current view: top level - gcore - rasterio.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2410 2635 91.5 %
Date: 2025-05-31 00:00:17 Functions: 684 727 94.1 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  Contains default implementation of GDALRasterBand::IRasterIO()
       5             :  *           and supporting functions of broader utility.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 1998, Frank Warmerdam
      10             :  * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal.h"
      17             : #include "gdal_priv.h"
      18             : 
      19             : #include <cassert>
      20             : #include <climits>
      21             : #include <cmath>
      22             : #include <cstddef>
      23             : #include <cstdio>
      24             : #include <cstdlib>
      25             : #include <cstring>
      26             : 
      27             : #include <algorithm>
      28             : #include <limits>
      29             : #include <stdexcept>
      30             : #include <type_traits>
      31             : 
      32             : #include "cpl_conv.h"
      33             : #include "cpl_cpu_features.h"
      34             : #include "cpl_error.h"
      35             : #include "cpl_float.h"
      36             : #include "cpl_progress.h"
      37             : #include "cpl_string.h"
      38             : #include "cpl_vsi.h"
      39             : #include "gdal_priv_templates.hpp"
      40             : #include "gdal_vrt.h"
      41             : #include "gdalwarper.h"
      42             : #include "memdataset.h"
      43             : #include "vrtdataset.h"
      44             : 
      45             : #if defined(__x86_64) || defined(_M_X64)
      46             : #include <emmintrin.h>
      47             : #define HAVE_SSE2
      48             : #elif defined(USE_NEON_OPTIMIZATIONS)
      49             : #include "include_sse2neon.h"
      50             : #define HAVE_SSE2
      51             : #endif
      52             : 
      53             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
      54             : #include "rasterio_ssse3.h"
      55             : #ifdef __SSSE3__
      56             : #include <tmmintrin.h>
      57             : #endif
      58             : #endif
       59             : 
                        // Forward declaration of the helper called by the single-byte path of
                        // DownsamplingIntegerXFactor() below. At that call site the arguments
                        // are, in order: source pointer, byte step between source pixels,
                        // destination pointer, byte step between destination pixels, and the
                        // number of pixels to copy.
                        // NOTE(review): the definition is not visible in this excerpt; being
                        // static, it has to be provided elsewhere in this translation unit.
       60             : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
       61             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
       62             :                              int nDstPixelStride, GPtrDiff_t nWordCount);
      63             : 
       64             : /************************************************************************/
       65             : /*                    DownsamplingIntegerXFactor()                      */
       66             : /************************************************************************/
       67             : 
                        // Copies one row of horizontally subsampled pixels from the blocks of
                        // poBand into a destination buffer. Starting at source column iSrcX,
                        // one pixel is taken every nSrcXInc source columns and written every
                        // nPixelSpace bytes starting at pabyDstData. Blocks are locked on demand
                        // with GetLockedBlockRef(nLBlockX, nLBlockY, FALSE) whenever the source
                        // column moves past the end of the block currently held.
                        //
                        // Template parameters:
                        //   bSameDataType   when true, pixels are copied verbatim (memcpy, or
                        //                   GDALFastCopyByte for one-byte pixels); when false
                        //                   they are converted with GDALCopyWords64 from
                        //                   eDataType to eBufType.
                        //   DATA_TYPE_SIZE  pixel size in bytes used when bSameDataType is true.
                        //
                        // Parameters:
                        //   poBand         band whose blocks are read; its GetXSize() bounds the
                        //                  source column used for the last output pixel.
                        //   iSrcX          source column of the first pixel to copy.
                        //   nSrcXInc       source column step between consecutive output pixels.
                        //   iSrcOffsetCst  constant offset, in pixels, added to the in-block
                        //                  column when locating the source pixel (presumably the
                        //                  start of the current row inside the block - TODO
                        //                  confirm against the caller).
                        //   pabyDstData    address where the first output pixel is written.
                        //   nPixelSpace    byte distance between consecutive output pixels.
                        //   nBufXSize      number of output pixels requested; the loop counter
                        //                  is initialised to nBufXSize - 1.
                        //   eDataType      data type of the source pixels (gives the pixel size
                        //                  when bSameDataType is false, and is the source type
                        //                  for the conversion).
                        //   eBufType       data type of the output pixels (conversion path only).
                        //   nStartBlockX   [in,out] column of the first pixel of the block held
                        //                  in poBlock; rewritten each time a block is loaded.
                        //   nBlockXSize    block width in pixels.
                        //   poBlock        [in,out] block currently locked. Asserted non-null on
                        //                  entry when iSrcX lies before nStartBlockX +
                        //                  nBlockXSize. When another block is needed the lock on
                        //                  the old one is dropped and poBlock is replaced. The
                        //                  block held on successful return is not unlocked here
                        //                  (NOTE(review): presumably released by the caller).
                        //   nLBlockY       block row index passed to GetLockedBlockRef.
                        //
                        // Returns false as soon as GetLockedBlockRef returns a null block (poBlock
                        // is then null), true otherwise.
       68             : template <bool bSameDataType, int DATA_TYPE_SIZE>
       69      695677 : static bool DownsamplingIntegerXFactor(
       70             :     GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
       71             :     GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
       72             :     GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
       73             :     int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
       74             : {
                            // Source pixel size in bytes; the template constant is used when
                            // source and output types are identical.
       75      695677 :     const int nBandDataSize =
       76             :         bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
                            // Output pixels left after the first one. The first pixel is handled
                            // by jumping into the loop body below without evaluating the loop test.
       77      695677 :     int nOuterLoopIters = nBufXSize - 1;
                            // Byte distance between two consecutive sampled source pixels.
       78      695677 :     const int nIncSrcOffset = nSrcXInc * nBandDataSize;
       79             :     const GByte *CPL_RESTRICT pabySrcData;
                            // Column just past the block described by nStartBlockX.
       80      695677 :     int nEndBlockX = nBlockXSize + nStartBlockX;
       81             : 
                            // Entry: go straight to the part of the loop body that sets up the
                            // source pointer, skipping the per-iteration advance at the top of the
                            // loop. The block passed in is reused when it already covers iSrcX,
                            // otherwise a block is loaded first.
       82      695677 :     if (iSrcX < nEndBlockX)
       83             :     {
       84      294999 :         CPLAssert(poBlock);
       85      294999 :         goto no_reload_block;
       86             :     }
       87      400678 :     goto reload_block;
       88             : 
       89             :     // Don't do the last iteration in the loop, as iSrcX might go beyond
       90             :     // nRasterXSize - 1
       91     1264772 :     while (--nOuterLoopIters >= 1)
       92             :     {
                                // Step to the next sampled source column and the next output slot.
       93      201834 :         iSrcX += nSrcXInc;
       94      201834 :         pabySrcData += nIncSrcOffset;
       95      201834 :         pabyDstData += nPixelSpace;
       96             : 
       97             :         /* --------------------------------------------------------------------
       98             :          */
       99             :         /*      Ensure we have the appropriate block loaded. */
      100             :         /* --------------------------------------------------------------------
      101             :          */
      102      201834 :         if (iSrcX >= nEndBlockX)
      103             :         {
      104      201834 :         reload_block:
      105             :         {
                                    // Lock the block that contains iSrcX, releasing the lock on the
                                    // previously held block (if any) first.
      106      615102 :             const int nLBlockX = iSrcX / nBlockXSize;
      107      615102 :             nStartBlockX = nLBlockX * nBlockXSize;
      108      615102 :             nEndBlockX = nStartBlockX + nBlockXSize;
      109             : 
      110      615102 :             if (poBlock != nullptr)
      111      341314 :                 poBlock->DropLock();
      112             : 
      113      615102 :             poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
      114      615102 :             if (poBlock == nullptr)
      115             :             {
                                        // No block is held at this point: the old lock was dropped
                                        // above and poBlock is now null.
      116           1 :                 return false;
      117             :             }
      118             :         }
      119             : 
                                // Recompute the source pointer from the block data for the current
                                // iSrcX. Also the jump target when the held block already covers
                                // iSrcX (function entry and the final pixel).
      120      615101 :         no_reload_block:
      121             :             const GByte *pabySrcBlock =
      122     1264772 :                 static_cast<const GByte *>(poBlock->GetDataRef());
      123     1264772 :             GPtrDiff_t iSrcOffset =
      124     1264772 :                 (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
      125     1264772 :             pabySrcData = pabySrcBlock + iSrcOffset;
      126             :         }
      127             : 
      128             :         /* --------------------------------------------------------------------
      129             :          */
      130             :         /*      Copy the maximum run of pixels. */
      131             :         /* --------------------------------------------------------------------
      132             :          */
      133             : 
                                // Number of sampled columns from iSrcX up to the end of the current
                                // block (distance divided by nSrcXInc, rounded up), capped by
                                // nOuterLoopIters so that the final output pixel is left to the
                                // dedicated step after the loop.
      134     1264772 :         const int nIters = std::min(
      135     1264772 :             (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
      136             :         if (bSameDataType)
      137             :         {
                                    // The pixel at the current position is always copied, even when
                                    // nIters is 0 (which happens once nOuterLoopIters is exhausted).
      138     1264367 :             memcpy(pabyDstData, pabySrcData, nBandDataSize);
      139     1264367 :             if (nIters > 1)
      140             :             {
      141             :                 if (DATA_TYPE_SIZE == 1)
      142             :                 {
                                            // Move to the second pixel of the run and hand the
                                            // remaining nIters - 1 pixels to GDALFastCopyByte. The
                                            // pointers are then moved nIters - 2 further steps, so
                                            // that in total they advanced nIters - 1 steps, the same
                                            // as in the generic branch below.
      143      326246 :                     pabySrcData += nIncSrcOffset;
      144      326246 :                     pabyDstData += nPixelSpace;
      145      326246 :                     GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
      146      326246 :                                      nPixelSpace, nIters - 1);
      147      326246 :                     pabySrcData +=
      148      326246 :                         static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
      149      326246 :                     pabyDstData +=
      150      326246 :                         static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
      151             :                 }
      152             :                 else
      153             :                 {
                                            // Copy the remaining nIters - 1 pixels one at a time.
      154     4395158 :                     for (int i = 0; i < nIters - 1; i++)
      155             :                     {
      156     4197064 :                         pabySrcData += nIncSrcOffset;
      157     4197064 :                         pabyDstData += nPixelSpace;
      158     4197064 :                         memcpy(pabyDstData, pabySrcData, nBandDataSize);
      159             :                     }
      160             :                 }
                                        // Account for the nIters - 1 extra pixels just copied; the step
                                        // to the following pixel is done at the top of the loop.
      161      524340 :                 iSrcX += nSrcXInc * (nIters - 1);
      162      524340 :                 nOuterLoopIters -= nIters - 1;
      163             :             }
      164             :         }
      165             :         else
      166             :         {
      167             :             // Type to type conversion ...
                                    // At least one pixel is converted even when nIters is 0 or less.
      168         405 :             GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
      169         405 :                             eBufType, nPixelSpace, std::max(1, nIters));
      170         405 :             if (nIters > 1)
      171             :             {
                                        // Leave the pointers, iSrcX and the counter on the last pixel
                                        // converted, as in the same-type branch above.
      172         198 :                 pabySrcData +=
      173         198 :                     static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
      174         198 :                 pabyDstData +=
      175         198 :                     static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
      176         198 :                 iSrcX += nSrcXInc * (nIters - 1);
      177         198 :                 nOuterLoopIters -= nIters - 1;
      178             :             }
      179             :         }
      180             :     }
      181             : 
      182             :     // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
                            // The counter is exactly 0 here when one output pixel is still pending.
                            // Its source column is clamped to the last column of the band; the sum
                            // is formed in 64 bits (presumably so that it cannot overflow int). The
                            // loop body is then re-entered once through the labels. On the way out
                            // the loop test decrements the counter to -1, so this branch is not
                            // taken a second time.
      183     1062938 :     if (nOuterLoopIters == 0)
      184             :     {
      185      367262 :         const int nRasterXSize = poBand->GetXSize();
      186      367262 :         iSrcX =
      187      734524 :             static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
      188      367262 :                                       static_cast<GInt64>(nRasterXSize - 1)));
      189      367262 :         pabyDstData += nPixelSpace;
      190      367262 :         if (iSrcX < nEndBlockX)
      191             :         {
      192      354672 :             goto no_reload_block;
      193             :         }
      194       12590 :         goto reload_block;
      195             :     }
                            // All requested pixels were copied; poBlock still refers to the last
                            // block locked by this function (or the one passed in).
      196      695676 :     return true;
      197             : }
     198             : 
     199             : /************************************************************************/
     200             : /*                             IRasterIO()                              */
     201             : /*                                                                      */
     202             : /*      Default internal implementation of RasterIO() ... utilizes      */
     203             : /*      the Block access methods to satisfy the request.  This would    */
     204             : /*      normally only be overridden by formats with overviews.          */
     205             : /************************************************************************/
     206             : 
     207     6019240 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
     208             :                                  int nXSize, int nYSize, void *pData,
     209             :                                  int nBufXSize, int nBufYSize,
     210             :                                  GDALDataType eBufType, GSpacing nPixelSpace,
     211             :                                  GSpacing nLineSpace,
     212             :                                  GDALRasterIOExtraArg *psExtraArg)
     213             : 
     214             : {
     215     6019240 :     if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
     216             :     {
     217           0 :         CPLError(eFlushBlockErr, CPLE_AppDefined,
     218             :                  "An error occurred while writing a dirty block "
     219             :                  "from GDALRasterBand::IRasterIO");
     220           0 :         CPLErr eErr = eFlushBlockErr;
     221           0 :         eFlushBlockErr = CE_None;
     222           0 :         return eErr;
     223             :     }
     224     6019240 :     if (nBlockXSize <= 0 || nBlockYSize <= 0)
     225             :     {
     226          68 :         CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
     227           0 :         return CE_Failure;
     228             :     }
     229             : 
     230     6019170 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
     231     6019170 :     const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
     232     6019180 :     GByte dummyBlock[2] = {0, 0};
     233     6019180 :     GByte *pabySrcBlock =
     234             :         dummyBlock; /* to avoid Coverity warning about nullptr dereference */
     235     6019180 :     GDALRasterBlock *poBlock = nullptr;
     236     6019180 :     const bool bUseIntegerRequestCoords =
     237     6264990 :         (!psExtraArg->bFloatingPointWindowValidity ||
     238      245815 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
     239      222431 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
     240             : 
     241             :     /* ==================================================================== */
     242             :     /*      A common case is the data requested with the destination        */
     243             :     /*      is packed, and the block width is the raster width.             */
     244             :     /* ==================================================================== */
     245     5941490 :     if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
     246     3096660 :         nBlockXSize == GetXSize() && nBufXSize == nXSize &&
     247    11960700 :         nBufYSize == nYSize && bUseIntegerRequestCoords)
     248             :     {
     249     2984890 :         CPLErr eErr = CE_None;
     250     2984890 :         int nLBlockY = -1;
     251             : 
     252     8955340 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     253             :         {
     254     5971420 :             const int iSrcY = iBufYOff + nYOff;
     255             : 
     256     5971420 :             if (iSrcY < nLBlockY * nBlockYSize ||
     257     5970990 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
     258             :             {
     259     3246140 :                 nLBlockY = iSrcY / nBlockYSize;
     260     3246140 :                 bool bJustInitialize =
     261      295352 :                     eRWFlag == GF_Write && nXOff == 0 &&
     262     3598550 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     263       57057 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
     264             : 
     265             :                 // Is this a partial tile at right and/or bottom edges of
     266             :                 // the raster, and that is going to be completely written?
     267             :                 // If so, do not load it from storage, but zero it so that
      268             :                 // the content outside of the validity area is initialized.
     269     3246140 :                 bool bMemZeroBuffer = false;
     270      295352 :                 if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
     271       23813 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     272     3541580 :                     nYOff + nYSize == GetYSize() &&
     273          89 :                     nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
     274             :                 {
     275          89 :                     bJustInitialize = true;
     276          89 :                     bMemZeroBuffer = true;
     277             :                 }
     278             : 
     279     3246140 :                 if (poBlock)
     280      261243 :                     poBlock->DropLock();
     281             : 
     282     3246140 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     283     3246130 :                 poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
     284     3246270 :                 if (poBlock == nullptr)
     285             :                 {
     286        1070 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     287             :                         nullptr)
     288             :                     {
     289           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     290             :                                  "GetBlockRef failed at X block offset %d, "
     291             :                                  "Y block offset %d%s",
     292             :                                  0, nLBlockY,
     293           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     294           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     295             :                                      : "");
     296             :                     }
     297        1070 :                     eErr = CE_Failure;
     298        1070 :                     break;
     299             :                 }
     300             : 
     301     3245200 :                 if (eRWFlag == GF_Write)
     302      295352 :                     poBlock->MarkDirty();
     303             : 
     304     3245200 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     305     3245160 :                 if (bMemZeroBuffer)
     306             :                 {
     307          89 :                     memset(pabySrcBlock, 0,
     308          89 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     309          89 :                                nBlockXSize * nBlockYSize);
     310             :                 }
     311             :             }
     312             : 
     313     5970430 :             const auto nSrcByteOffset =
     314     5970430 :                 (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
     315     5970430 :                      nBlockXSize +
     316     5970430 :                  nXOff) *
     317     5970430 :                 nBandDataSize;
     318             : 
     319     5970430 :             if (eDataType == eBufType)
     320             :             {
     321     2335410 :                 if (eRWFlag == GF_Read)
     322     1864910 :                     memcpy(static_cast<GByte *>(pData) +
     323     1864910 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     324     1864910 :                            pabySrcBlock + nSrcByteOffset,
     325             :                            static_cast<size_t>(nLineSpace));
     326             :                 else
     327      470500 :                     memcpy(pabySrcBlock + nSrcByteOffset,
     328      470500 :                            static_cast<GByte *>(pData) +
     329      470500 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     330             :                            static_cast<size_t>(nLineSpace));
     331             :             }
     332             :             else
     333             :             {
     334             :                 // Type to type conversion.
     335     3635020 :                 if (eRWFlag == GF_Read)
     336     3613650 :                     GDALCopyWords64(
     337     3613650 :                         pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
     338             :                         static_cast<GByte *>(pData) +
     339     3613650 :                             static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     340             :                         eBufType, static_cast<int>(nPixelSpace), nBufXSize);
     341             :                 else
     342       21371 :                     GDALCopyWords64(static_cast<GByte *>(pData) +
     343       21371 :                                         static_cast<GPtrDiff_t>(iBufYOff) *
     344             :                                             nLineSpace,
     345             :                                     eBufType, static_cast<int>(nPixelSpace),
     346       21371 :                                     pabySrcBlock + nSrcByteOffset, eDataType,
     347             :                                     nBandDataSize, nBufXSize);
     348             :             }
     349             : 
     350     6043290 :             if (psExtraArg->pfnProgress != nullptr &&
     351       72839 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     352             :                                          psExtraArg->pProgressData))
     353             :             {
     354           5 :                 eErr = CE_Failure;
     355           5 :                 break;
     356             :             }
     357             :         }
     358             : 
     359     2985000 :         if (poBlock)
     360     2983920 :             poBlock->DropLock();
     361             : 
     362     2984970 :         return eErr;
     363             :     }
     364             : 
     365             :     /* ==================================================================== */
     366             :     /*      Do we have overviews that would be appropriate to satisfy       */
     367             :     /*      this request?                                                   */
     368             :     /* ==================================================================== */
     369     3034300 :     if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
     370             :         eRWFlag == GF_Read)
     371             :     {
     372             :         GDALRasterIOExtraArg sExtraArg;
     373        2902 :         GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
     374             : 
     375             :         const int nOverview =
     376        2902 :             GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
     377             :                                           nBufXSize, nBufYSize, &sExtraArg);
     378        2902 :         if (nOverview >= 0)
     379             :         {
     380        2827 :             GDALRasterBand *poOverviewBand = GetOverview(nOverview);
     381        2827 :             if (poOverviewBand == nullptr)
     382        2827 :                 return CE_Failure;
     383             : 
     384        2827 :             return poOverviewBand->RasterIO(
     385             :                 eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
     386        2827 :                 nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
     387             :         }
     388             :     }
     389             : 
     390      842693 :     if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
     391           6 :         nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
     392     3874140 :         nLineSpace == nPixelSpace * nBufXSize &&
     393           6 :         CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
     394             :     {
     395           0 :         memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
     396           0 :         return CE_None;
     397             :     }
     398             : 
     399             :     /* ==================================================================== */
     400             :     /*      The second case when we don't need subsample data but likely    */
     401             :     /*      need data type conversion.                                      */
     402             :     /* ==================================================================== */
     403     3031440 :     if (  // nPixelSpace == nBufDataSize &&
     404     3031440 :         nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
     405             :     {
     406             : #if DEBUG_VERBOSE
     407             :         printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
     408             :                nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
     409             : #endif
     410             : 
     411             :         /* --------------------------------------------------------------------
     412             :          */
     413             :         /*      Loop over buffer computing source locations. */
     414             :         /* --------------------------------------------------------------------
     415             :          */
     416             :         // Calculate starting values out of loop
     417     2466250 :         const int nLBlockXStart = nXOff / nBlockXSize;
     418     2466250 :         const int nXSpanEnd = nBufXSize + nXOff;
     419             : 
     420     2466250 :         int nYInc = 0;
     421     4971200 :         for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
     422     2504950 :              iBufYOff += nYInc, iSrcY += nYInc)
     423             :         {
     424     2505020 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     425             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     426     2505020 :             int nLBlockY = iSrcY / nBlockYSize;
     427     2505020 :             int nLBlockX = nLBlockXStart;
     428     2505020 :             int iSrcX = nXOff;
     429     5228480 :             while (iSrcX < nXSpanEnd)
     430             :             {
     431     2723510 :                 int nXSpan = nLBlockX * nBlockXSize;
     432     2723510 :                 if (nXSpan < INT_MAX - nBlockXSize)
     433     2723510 :                     nXSpan += nBlockXSize;
     434             :                 else
     435           0 :                     nXSpan = INT_MAX;
     436     2723510 :                 const int nXRight = nXSpan;
     437     2723510 :                 nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
     438     2723510 :                 const size_t nXSpanSize =
     439     2723510 :                     nXSpan * static_cast<size_t>(nPixelSpace);
     440             : 
     441     2723510 :                 bool bJustInitialize =
     442     2042150 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
     443       37242 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
     444     4791240 :                     nXOff <= nLBlockX * nBlockXSize &&
     445       25577 :                     nXOff + nXSize >= nXRight;
     446             : 
     447             :                 // Is this a partial tile at right and/or bottom edges of
     448             :                 // the raster, and that is going to be completely written?
     449             :                 // If so, do not load it from storage, but zero it so that
      450             :                 // the content outside of the validity area is initialized.
     451     2723510 :                 bool bMemZeroBuffer = false;
     452     2042150 :                 if (eRWFlag == GF_Write && !bJustInitialize &&
     453     2017810 :                     nXOff <= nLBlockX * nBlockXSize &&
     454     2016180 :                     nYOff <= nLBlockY * nBlockYSize &&
     455       12139 :                     (nXOff + nXSize >= nXRight ||
     456             :                      // cppcheck-suppress knownConditionTrueFalse
     457     4768360 :                      (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
     458       11959 :                     (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
     459       10737 :                      (nYOff + nYSize == GetYSize() &&
     460        1949 :                       nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
     461             :                 {
     462        3171 :                     bJustInitialize = true;
     463        3171 :                     bMemZeroBuffer = true;
     464             :                 }
     465             : 
     466             :                 /* --------------------------------------------------------------------
     467             :                  */
     468             :                 /*      Ensure we have the appropriate block loaded. */
     469             :                 /* --------------------------------------------------------------------
     470             :                  */
     471     2723510 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     472     2723540 :                 poBlock =
     473     2723520 :                     GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     474     2723540 :                 if (!poBlock)
     475             :                 {
     476          71 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     477             :                         nullptr)
     478             :                     {
     479           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     480             :                                  "GetBlockRef failed at X block offset %d, "
     481             :                                  "Y block offset %d%s",
     482             :                                  nLBlockX, nLBlockY,
     483           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     484           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     485             :                                      : "");
     486             :                     }
     487          71 :                     return (CE_Failure);
     488             :                 }
     489             : 
     490     2723470 :                 if (eRWFlag == GF_Write)
     491     2042150 :                     poBlock->MarkDirty();
     492             : 
     493     2723470 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     494     2723470 :                 if (bMemZeroBuffer)
     495             :                 {
     496        3171 :                     memset(pabySrcBlock, 0,
     497        3171 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     498        3171 :                                nBlockXSize * nBlockYSize);
     499             :                 }
     500             :                 /* --------------------------------------------------------------------
     501             :                  */
     502             :                 /*      Copy over this chunk of data. */
     503             :                 /* --------------------------------------------------------------------
     504             :                  */
     505     2723470 :                 GPtrDiff_t iSrcOffset =
     506     2723470 :                     (static_cast<GPtrDiff_t>(iSrcX) -
     507     2723470 :                      static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
     508     2723470 :                      (static_cast<GPtrDiff_t>(iSrcY) -
     509     2723470 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     510     2723470 :                          nBlockXSize) *
     511     2723470 :                     nBandDataSize;
     512             :                 // Fill up as many rows as possible for the loaded block.
     513     5446930 :                 const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
     514     2723470 :                                           nBufYSize - iBufYOff);
     515    58612700 :                 for (int k = 0; k < kmax; k++)
     516             :                 {
     517    55889300 :                     if (eDataType == eBufType && nPixelSpace == nBufDataSize)
     518             :                     {
     519    51932700 :                         if (eRWFlag == GF_Read)
     520    47489900 :                             memcpy(static_cast<GByte *>(pData) + iBufOffset +
     521    47489900 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     522    47489900 :                                    pabySrcBlock + iSrcOffset, nXSpanSize);
     523             :                         else
     524     4442790 :                             memcpy(pabySrcBlock + iSrcOffset,
     525     4442790 :                                    static_cast<GByte *>(pData) + iBufOffset +
     526     4442790 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     527             :                                    nXSpanSize);
     528             :                     }
     529             :                     else
     530             :                     {
     531             :                         /* type to type conversion */
     532     3956590 :                         if (eRWFlag == GF_Read)
     533     3897200 :                             GDALCopyWords64(
     534     3897200 :                                 pabySrcBlock + iSrcOffset, eDataType,
     535             :                                 nBandDataSize,
     536     3897200 :                                 static_cast<GByte *>(pData) + iBufOffset +
     537     3897200 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     538             :                                 eBufType, static_cast<int>(nPixelSpace),
     539             :                                 nXSpan);
     540             :                         else
     541       59398 :                             GDALCopyWords64(
     542       59398 :                                 static_cast<GByte *>(pData) + iBufOffset +
     543       59398 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     544             :                                 eBufType, static_cast<int>(nPixelSpace),
     545       59398 :                                 pabySrcBlock + iSrcOffset, eDataType,
     546             :                                 nBandDataSize, nXSpan);
     547             :                     }
     548             : 
     549    55889200 :                     iSrcOffset +=
     550    55889200 :                         static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
     551             :                 }
     552             : 
     553             :                 iBufOffset =
     554     2723460 :                     CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
     555     2723460 :                 nLBlockX++;
     556     2723460 :                 iSrcX += nXSpan;
     557             : 
     558     2723460 :                 poBlock->DropLock();
     559     2723460 :                 poBlock = nullptr;
     560             :             }
     561             : 
     562             :             /* Compute the increment to go on a block boundary */
     563     2504970 :             nYInc = nBlockYSize - (iSrcY % nBlockYSize);
     564             : 
     565     2506820 :             if (psExtraArg->pfnProgress != nullptr &&
     566        1849 :                 !psExtraArg->pfnProgress(
     567     2506820 :                     1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
     568             :                     psExtraArg->pProgressData))
     569             :             {
     570          12 :                 return CE_Failure;
     571             :             }
     572             :         }
     573             : 
     574     2466190 :         return CE_None;
     575             :     }
     576             : 
     577             :     /* ==================================================================== */
     578             :     /*      Loop reading required source blocks to satisfy output           */
     579             :     /*      request.  This is the most general implementation.              */
     580             :     /* ==================================================================== */
     581             : 
     582      565185 :     double dfXOff = nXOff;
     583      565185 :     double dfYOff = nYOff;
     584      565185 :     double dfXSize = nXSize;
     585      565185 :     double dfYSize = nYSize;
     586      565185 :     if (psExtraArg->bFloatingPointWindowValidity)
     587             :     {
     588      230468 :         dfXOff = psExtraArg->dfXOff;
     589      230468 :         dfYOff = psExtraArg->dfYOff;
     590      230468 :         dfXSize = psExtraArg->dfXSize;
     591      230468 :         dfYSize = psExtraArg->dfYSize;
     592             :     }
     593             : 
     594             :     /* -------------------------------------------------------------------- */
     595             :     /*      Compute stepping increment.                                     */
     596             :     /* -------------------------------------------------------------------- */
     597      565185 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
     598      565185 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
     599      565185 :     CPLErr eErr = CE_None;
     600             : 
     601      565185 :     if (eRWFlag == GF_Write)
     602             :     {
     603             :         /* --------------------------------------------------------------------
     604             :          */
     605             :         /*    Write case */
     606             :         /*    Loop over raster window computing source locations in the buffer.
     607             :          */
     608             :         /* --------------------------------------------------------------------
     609             :          */
     610      166651 :         GByte *pabyDstBlock = nullptr;
     611      166651 :         int nLBlockX = -1;
     612      166651 :         int nLBlockY = -1;
     613             : 
     614     1259990 :         for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
     615             :         {
     616     1093340 :             const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
     617             : 
     618    12384000 :             for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
     619             :             {
     620    11290600 :                 const int iBufXOff =
     621    11290600 :                     static_cast<int>((iDstX - nXOff) / dfSrcXInc);
     622    11290600 :                 GPtrDiff_t iBufOffset =
     623    11290600 :                     static_cast<GPtrDiff_t>(iBufYOff) *
     624             :                         static_cast<GPtrDiff_t>(nLineSpace) +
     625    11290600 :                     iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
     626             : 
     627             :                 // FIXME: this code likely doesn't work if the dirty block gets
     628             :                 // flushed to disk before being completely written.
     629             :                 // In the meantime, bJustInitialize should probably be set to
     630             :                 // FALSE even if it is not ideal performance wise, and for
     631             :                 // lossy compression.
     632             : 
     633             :                 /* --------------------------------------------------------------------
     634             :                  */
     635             :                 /*      Ensure we have the appropriate block loaded. */
     636             :                 /* --------------------------------------------------------------------
     637             :                  */
     638    11290600 :                 if (iDstX < nLBlockX * nBlockXSize ||
     639    11041300 :                     iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
     640    10584600 :                     iDstY < nLBlockY * nBlockYSize ||
     641    10584600 :                     iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
     642             :                 {
     643      738682 :                     nLBlockX = iDstX / nBlockXSize;
     644      738682 :                     nLBlockY = iDstY / nBlockYSize;
     645             : 
     646      738682 :                     const bool bJustInitialize =
     647     1065950 :                         nYOff <= nLBlockY * nBlockYSize &&
     648      327271 :                         nYOff + nYSize - nBlockYSize >=
     649      327271 :                             nLBlockY * nBlockYSize &&
     650     1116260 :                         nXOff <= nLBlockX * nBlockXSize &&
     651       50305 :                         nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
     652             :                     /*bool bMemZeroBuffer = FALSE;
     653             :                     if( !bJustInitialize &&
     654             :                         nXOff <= nLBlockX * nBlockXSize &&
     655             :                         nYOff <= nLBlockY * nBlockYSize &&
     656             :                         (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
     657             :                          (nXOff + nXSize == GetXSize() &&
     658             :                          (nLBlockX+1) * nBlockXSize > GetXSize())) &&
     659             :                         (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
     660             :                          (nYOff + nYSize == GetYSize() &&
     661             :                          (nLBlockY+1) * nBlockYSize > GetYSize())) )
     662             :                     {
     663             :                         bJustInitialize = TRUE;
     664             :                         bMemZeroBuffer = TRUE;
     665             :                     }*/
     666      738682 :                     if (poBlock != nullptr)
     667      572031 :                         poBlock->DropLock();
     668             : 
     669      738682 :                     poBlock =
     670      738682 :                         GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     671      738682 :                     if (poBlock == nullptr)
     672             :                     {
     673           0 :                         return (CE_Failure);
     674             :                     }
     675             : 
     676      738682 :                     poBlock->MarkDirty();
     677             : 
     678      738682 :                     pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
     679             :                     /*if( bMemZeroBuffer )
     680             :                     {
     681             :                         memset(pabyDstBlock, 0,
     682             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
     683             :                     * nBlockYSize);
     684             :                     }*/
     685             :                 }
     686             : 
     687             :                 // To make Coverity happy. Should not happen by design.
     688    11290600 :                 if (pabyDstBlock == nullptr)
     689             :                 {
     690           0 :                     CPLAssert(false);
     691             :                     eErr = CE_Failure;
     692             :                     break;
     693             :                 }
     694             : 
     695             :                 /* --------------------------------------------------------------------
     696             :                  */
     697             :                 /*      Copy over this pixel of data. */
     698             :                 /* --------------------------------------------------------------------
     699             :                  */
     700    11290600 :                 GPtrDiff_t iDstOffset =
     701    11290600 :                     (static_cast<GPtrDiff_t>(iDstX) -
     702    11290600 :                      static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
     703    11290600 :                      (static_cast<GPtrDiff_t>(iDstY) -
     704    11290600 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     705    11290600 :                          nBlockXSize) *
     706    11290600 :                     nBandDataSize;
     707             : 
     708    11290600 :                 if (eDataType == eBufType)
     709             :                 {
     710    11287500 :                     memcpy(pabyDstBlock + iDstOffset,
     711    11287500 :                            static_cast<GByte *>(pData) + iBufOffset,
     712             :                            nBandDataSize);
     713             :                 }
     714             :                 else
     715             :                 {
     716             :                     /* type to type conversion ... ouch, this is expensive way
     717             :                     of handling single words */
     718        3096 :                     GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
     719        3096 :                                     eBufType, 0, pabyDstBlock + iDstOffset,
     720             :                                     eDataType, 0, 1);
     721             :                 }
     722             :             }
     723             : 
     724     1093340 :             if (psExtraArg->pfnProgress != nullptr &&
     725           0 :                 !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
     726             :                                          psExtraArg->pProgressData))
     727             :             {
     728           0 :                 eErr = CE_Failure;
     729           0 :                 break;
     730             :             }
     731             :         }
     732             :     }
     733             :     else
     734             :     {
     735      398534 :         if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
     736             :         {
     737        8601 :             if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
     738        2513 :                  psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
     739        2511 :                  psExtraArg->eResampleAlg == GRIORA_Bilinear ||
     740        6093 :                  psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
     741        2925 :                 GetColorTable() != nullptr)
     742             :             {
     743           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     744             :                          "Resampling method not supported on paletted band. "
     745             :                          "Falling back to nearest neighbour");
     746             :             }
     747        3047 :             else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
     748           3 :                      GDALDataTypeIsComplex(eDataType))
     749             :             {
     750           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     751             :                          "Resampling method not supported on complex data type "
     752             :                          "band. Falling back to nearest neighbour");
     753             :             }
     754             :             else
     755             :             {
     756        3044 :                 return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
     757             :                                          pData, nBufXSize, nBufYSize, eBufType,
     758        3045 :                                          nPixelSpace, nLineSpace, psExtraArg);
     759             :             }
     760             :         }
     761             : 
     762      395483 :         int nLimitBlockY = 0;
     763      395483 :         const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
     764      395483 :         int nStartBlockX = -nBlockXSize;
     765      395483 :         const double EPS = 1e-10;
     766      395483 :         int nLBlockY = -1;
     767      395483 :         const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
     768      395483 :         const bool bIntegerXFactor =
     769      372806 :             bUseIntegerRequestCoords &&
     770      669271 :             static_cast<int>(dfSrcXInc) == dfSrcXInc &&
     771      273788 :             static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
     772             : 
     773             :         /* --------------------------------------------------------------------
     774             :          */
     775             :         /*      Read case */
     776             :         /*      Loop over buffer computing source locations. */
     777             :         /* --------------------------------------------------------------------
     778             :          */
     779     2451410 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     780             :         {
     781             :             // Add small epsilon to avoid some numeric precision issues.
     782     2055940 :             const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
     783     2055940 :             const int iSrcY = static_cast<int>(std::min(
     784     2055940 :                 std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
     785             : 
     786     2055940 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     787             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     788             : 
     789     2055940 :             if (iSrcY >= nLimitBlockY)
     790             :             {
     791      433624 :                 nLBlockY = iSrcY / nBlockYSize;
     792      433624 :                 nLimitBlockY = nLBlockY * nBlockYSize;
     793      433624 :                 if (nLimitBlockY < INT_MAX - nBlockYSize)
     794      433624 :                     nLimitBlockY += nBlockYSize;
     795             :                 else
     796           0 :                     nLimitBlockY = INT_MAX;
     797             :                 // Make sure a new block is loaded.
     798      433624 :                 nStartBlockX = -nBlockXSize;
     799             :             }
     800     1622320 :             else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
     801             :             {
     802             :                 // Make sure a new block is loaded.
     803      441987 :                 nStartBlockX = -nBlockXSize;
     804             :             }
     805             : 
     806     2055940 :             GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
     807     2055940 :                                        static_cast<GPtrDiff_t>(nBlockXSize);
     808             : 
     809     2055940 :             if (bIntegerXFactor)
     810             :             {
     811      695677 :                 int iSrcX = static_cast<int>(dfSrcXStart);
     812      695677 :                 const int nSrcXInc = static_cast<int>(dfSrcXInc);
     813      695677 :                 GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
     814      695677 :                 bool bRet = false;
     815      695677 :                 if (bByteCopy)
     816             :                 {
     817      585768 :                     bRet = DownsamplingIntegerXFactor<true, 1>(
     818             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     819             :                         static_cast<int>(nPixelSpace), nBufXSize, GDT_Byte,
     820             :                         GDT_Byte, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     821             :                 }
     822      109909 :                 else if (eDataType == eBufType)
     823             :                 {
     824      109704 :                     switch (nBandDataSize)
     825             :                     {
     826      109624 :                         case 2:
     827      109624 :                             bRet = DownsamplingIntegerXFactor<true, 2>(
     828             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     829             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     830             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     831             :                                 nBlockXSize, poBlock, nLBlockY);
     832      109624 :                             break;
     833          22 :                         case 4:
     834          22 :                             bRet = DownsamplingIntegerXFactor<true, 4>(
     835             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     836             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     837             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     838             :                                 nBlockXSize, poBlock, nLBlockY);
     839          22 :                             break;
     840          56 :                         case 8:
     841          56 :                             bRet = DownsamplingIntegerXFactor<true, 8>(
     842             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     843             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     844             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     845             :                                 nBlockXSize, poBlock, nLBlockY);
     846          56 :                             break;
     847           2 :                         case 16:
     848           2 :                             bRet = DownsamplingIntegerXFactor<true, 16>(
     849             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     850             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     851             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     852             :                                 nBlockXSize, poBlock, nLBlockY);
     853           2 :                             break;
     854           0 :                         default:
     855           0 :                             CPLAssert(false);
     856             :                             break;
     857             :                     }
     858             :                 }
     859             :                 else
     860             :                 {
     861         205 :                     bRet = DownsamplingIntegerXFactor<false, 0>(
     862             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     863             :                         static_cast<int>(nPixelSpace), nBufXSize, eDataType,
     864             :                         eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     865             :                 }
     866      695677 :                 if (!bRet)
     867           1 :                     eErr = CE_Failure;
     868             :             }
     869             :             else
     870             :             {
     871     1360260 :                 double dfSrcX = dfSrcXStart;
     872   582293000 :                 for (int iBufXOff = 0; iBufXOff < nBufXSize;
     873   580933000 :                      iBufXOff++, dfSrcX += dfSrcXInc)
     874             :                 {
     875             :                     // TODO?: try to avoid the clamping for most iterations
     876             :                     const int iSrcX = static_cast<int>(
     877  1161870000 :                         std::min(std::max(0.0, dfSrcX),
     878   580933000 :                                  static_cast<double>(nRasterXSize - 1)));
     879             : 
     880             :                     /* --------------------------------------------------------------------
     881             :                      */
     882             :                     /*      Ensure we have the appropriate block loaded. */
     883             :                     /* --------------------------------------------------------------------
     884             :                      */
     885   580933000 :                     if (iSrcX >= nBlockXSize + nStartBlockX)
     886             :                     {
     887     1702800 :                         const int nLBlockX = iSrcX / nBlockXSize;
     888     1702800 :                         nStartBlockX = nLBlockX * nBlockXSize;
     889             : 
     890     1702800 :                         if (poBlock != nullptr)
     891     1581100 :                             poBlock->DropLock();
     892             : 
     893     1702800 :                         poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     894     1702800 :                         if (poBlock == nullptr)
     895             :                         {
     896           9 :                             eErr = CE_Failure;
     897           9 :                             break;
     898             :                         }
     899             : 
     900             :                         pabySrcBlock =
     901     1702790 :                             static_cast<GByte *>(poBlock->GetDataRef());
     902             :                     }
     903   580933000 :                     const GPtrDiff_t nDiffX =
     904   580933000 :                         static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
     905             : 
     906             :                     /* --------------------------------------------------------------------
     907             :                      */
     908             :                     /*      Copy over this pixel of data. */
     909             :                     /* --------------------------------------------------------------------
     910             :                      */
     911             : 
     912   580933000 :                     if (bByteCopy)
     913             :                     {
     914   527231000 :                         GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
     915   527231000 :                         static_cast<GByte *>(pData)[iBufOffset] =
     916   527231000 :                             pabySrcBlock[iSrcOffset];
     917             :                     }
     918    53701600 :                     else if (eDataType == eBufType)
     919             :                     {
     920    48225600 :                         GPtrDiff_t iSrcOffset =
     921    48225600 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     922    48225600 :                         memcpy(static_cast<GByte *>(pData) + iBufOffset,
     923    48225600 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
     924             :                     }
     925             :                     else
     926             :                     {
     927             :                         // Type to type conversion ...
     928     5476050 :                         GPtrDiff_t iSrcOffset =
     929     5476050 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     930     5476050 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
     931             :                                         static_cast<GByte *>(pData) +
     932     5476050 :                                             iBufOffset,
     933             :                                         eBufType, 0, 1);
     934             :                     }
     935             : 
     936   580933000 :                     iBufOffset += static_cast<int>(nPixelSpace);
     937             :                 }
     938             :             }
     939     2055940 :             if (eErr == CE_Failure)
     940          11 :                 break;
     941             : 
     942     2287020 :             if (psExtraArg->pfnProgress != nullptr &&
     943      231086 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     944             :                                          psExtraArg->pProgressData))
     945             :             {
     946           1 :                 eErr = CE_Failure;
     947           1 :                 break;
     948             :             }
     949             :         }
     950             :     }
     951             : 
     952      562134 :     if (poBlock != nullptr)
     953      562124 :         poBlock->DropLock();
     954             : 
     955      562134 :     return eErr;
     956             : }
     957             : 
     958             : /************************************************************************/
     959             : /*                         GDALRasterIOTransformer()                    */
     960             : /************************************************************************/
     961             : 
     962             : struct GDALRasterIOTransformerStruct // Scale/offset parameters read by GDALRasterIOTransformer() through its pTransformerArg.
     963             : {
     964             :     double dfXOff; // Added to the scaled destination X to obtain the source X.
     965             :     double dfYOff; // Added to the scaled destination Y to obtain the source Y.
     966             :     double dfXRatioDstToSrc; // Factor applied to a destination X when mapping destination -> source.
     967             :     double dfYRatioDstToSrc; // Factor applied to a destination Y when mapping destination -> source.
     968             : };
     969             : 
     970        6748 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
     971             :                                    int nPointCount, double *x, double *y,
     972             :                                    double * /* z */, int *panSuccess)
     973             : { // Rewrites the nPointCount entries of x[] and y[] in place using the scale and offset held in pTransformerArg; the z array is ignored.
     974        6748 :     GDALRasterIOTransformerStruct *psParams =
     975             :         static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg); // The opaque argument is treated as a GDALRasterIOTransformerStruct.
     976        6748 :     if (bDstToSrc) // Destination -> source: multiply by the ratio, then add the offset.
     977             :     {
     978      252996 :         for (int i = 0; i < nPointCount; i++)
     979             :         {
     980      246836 :             x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
     981      246836 :             y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
     982      246836 :             panSuccess[i] = TRUE; // Every point is flagged as successfully transformed.
     983             :         }
     984             :     }
     985             :     else // Source -> destination: inverse of the branch above (subtract the offset, divide by the ratio).
     986             :     {
     987        1176 :         for (int i = 0; i < nPointCount; i++)
     988             :         {
     989         588 :             x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc; // NOTE(review): no guard against a zero ratio; presumably callers never supply one - confirm.
     990         588 :             y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
     991         588 :             panSuccess[i] = TRUE; // Every point is flagged as successfully transformed.
     992             :         }
     993             :     }
     994        6748 :     return TRUE; // Overall success is always reported.
     995             : }
     996             : 
     997             : /************************************************************************/
     998             : /*                          RasterIOResampled()                         */
     999             : /************************************************************************/
    1000             : 
    1001             : //! @cond Doxygen_Suppress
    1002        3045 : CPLErr GDALRasterBand::RasterIOResampled(
    1003             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1004             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1005             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    1006             : {
    1007             :     // Determine if we use warping resampling or overview resampling
    1008             :     const bool bUseWarp =
    1009        3045 :         (GDALDataTypeIsComplex(eDataType) &&
    1010        3202 :          psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    1011         157 :          psExtraArg->eResampleAlg != GRIORA_Mode);
    1012             : 
    1013        3045 :     double dfXOff = nXOff;
    1014        3045 :     double dfYOff = nYOff;
    1015        3045 :     double dfXSize = nXSize;
    1016        3045 :     double dfYSize = nYSize;
    1017        3045 :     if (psExtraArg->bFloatingPointWindowValidity)
    1018             :     {
    1019        2586 :         dfXOff = psExtraArg->dfXOff;
    1020        2586 :         dfYOff = psExtraArg->dfYOff;
    1021        2586 :         dfXSize = psExtraArg->dfXSize;
    1022        2586 :         dfYSize = psExtraArg->dfYSize;
    1023             :     }
    1024             : 
    1025        3045 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1026        3045 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1027             : 
    1028             :     // Determine the coordinates in the "virtual" output raster to see
    1029             :     // if there are not integers, in which case we will use them as a shift
    1030             :     // so that subwindow extracts give the exact same results as entire raster
    1031             :     // scaling.
    1032        3045 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1033        3045 :     bool bHasXOffVirtual = false;
    1034        3045 :     int nDestXOffVirtual = 0;
    1035        3045 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1036             :     {
    1037        2717 :         bHasXOffVirtual = true;
    1038        2717 :         dfXOff = nXOff;
    1039        2717 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1040             :     }
    1041             : 
    1042        3045 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1043        3045 :     bool bHasYOffVirtual = false;
    1044        3045 :     int nDestYOffVirtual = 0;
    1045        3045 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1046             :     {
    1047        2712 :         bHasYOffVirtual = true;
    1048        2712 :         dfYOff = nYOff;
    1049        2712 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1050             :     }
    1051             : 
    1052             :     // Create a MEM dataset that wraps the output buffer.
    1053             :     GDALDataset *poMEMDS;
    1054        3045 :     void *pTempBuffer = nullptr;
    1055        3045 :     GSpacing nPSMem = nPixelSpace;
    1056        3045 :     GSpacing nLSMem = nLineSpace;
    1057        3045 :     void *pDataMem = pData;
    1058        3045 :     GDALDataType eDTMem = eBufType;
    1059        3045 :     if (eBufType != eDataType)
    1060             :     {
    1061          40 :         nPSMem = GDALGetDataTypeSizeBytes(eDataType);
    1062          40 :         nLSMem = nPSMem * nBufXSize;
    1063             :         pTempBuffer =
    1064          40 :             VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
    1065          40 :         if (pTempBuffer == nullptr)
    1066           0 :             return CE_Failure;
    1067          40 :         pDataMem = pTempBuffer;
    1068          40 :         eDTMem = eDataType;
    1069             :     }
    1070             : 
    1071             :     poMEMDS =
    1072        3045 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1073             :                            nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
    1074        3045 :     GByte *pabyData = static_cast<GByte *>(pDataMem) -
    1075        3045 :                       nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
    1076        3045 :     GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
    1077             :         poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
    1078        3045 :     poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
    1079             : 
    1080        3045 :     const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1081        3045 :     const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    1082        3045 :     if (pszNBITS)
    1083           6 :         GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
    1084           6 :             "NBITS", pszNBITS, "IMAGE_STRUCTURE");
    1085             : 
    1086        3044 :     CPLErr eErr = CE_None;
    1087             : 
    1088             :     // Do the resampling.
    1089        3044 :     if (bUseWarp)
    1090             :     {
    1091         149 :         int bHasNoData = FALSE;
    1092         149 :         double dfNoDataValue = GetNoDataValue(&bHasNoData);
    1093             : 
    1094         149 :         VRTDatasetH hVRTDS = nullptr;
    1095         149 :         GDALRasterBandH hVRTBand = nullptr;
    1096         149 :         if (GetDataset() == nullptr)
    1097             :         {
    1098             :             /* Create VRT dataset that wraps the whole dataset */
    1099           0 :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1100           0 :             VRTAddBand(hVRTDS, eDataType, nullptr);
    1101           0 :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1102           0 :             VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
    1103             :                                0, 0, nRasterXSize, nRasterYSize, nullptr,
    1104             :                                VRT_NODATA_UNSET);
    1105             : 
    1106             :             /* Add a mask band if needed */
    1107           0 :             if (GetMaskFlags() != GMF_ALL_VALID)
    1108             :             {
    1109           0 :                 GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
    1110             :                 VRTSourcedRasterBand *poVRTMaskBand =
    1111             :                     reinterpret_cast<VRTSourcedRasterBand *>(
    1112             :                         reinterpret_cast<GDALRasterBand *>(hVRTBand)
    1113           0 :                             ->GetMaskBand());
    1114           0 :                 poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
    1115           0 :                                                  nRasterYSize, 0, 0,
    1116           0 :                                                  nRasterXSize, nRasterYSize);
    1117             :             }
    1118             :         }
    1119             : 
    1120         149 :         GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
    1121         149 :         switch (psExtraArg->eResampleAlg)
    1122             :         {
    1123           0 :             case GRIORA_NearestNeighbour:
    1124           0 :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1125           0 :                 break;
    1126         147 :             case GRIORA_Bilinear:
    1127         147 :                 psWarpOptions->eResampleAlg = GRA_Bilinear;
    1128         147 :                 break;
    1129           0 :             case GRIORA_Cubic:
    1130           0 :                 psWarpOptions->eResampleAlg = GRA_Cubic;
    1131           0 :                 break;
    1132           0 :             case GRIORA_CubicSpline:
    1133           0 :                 psWarpOptions->eResampleAlg = GRA_CubicSpline;
    1134           0 :                 break;
    1135           0 :             case GRIORA_Lanczos:
    1136           0 :                 psWarpOptions->eResampleAlg = GRA_Lanczos;
    1137           0 :                 break;
    1138           0 :             case GRIORA_Average:
    1139           0 :                 psWarpOptions->eResampleAlg = GRA_Average;
    1140           0 :                 break;
    1141           2 :             case GRIORA_RMS:
    1142           2 :                 psWarpOptions->eResampleAlg = GRA_RMS;
    1143           2 :                 break;
    1144           0 :             case GRIORA_Mode:
    1145           0 :                 psWarpOptions->eResampleAlg = GRA_Mode;
    1146           0 :                 break;
    1147           0 :             default:
    1148           0 :                 CPLAssert(false);
    1149             :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1150             :                 break;
    1151             :         }
    1152         149 :         psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
    1153         149 :         psWarpOptions->hDstDS = poMEMDS;
    1154         149 :         psWarpOptions->nBandCount = 1;
    1155         149 :         int nSrcBandNumber = hVRTDS ? 1 : nBand;
    1156         149 :         int nDstBandNumber = 1;
    1157         149 :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1158         149 :         psWarpOptions->panDstBands = &nDstBandNumber;
    1159         298 :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress
    1160         149 :                                          ? psExtraArg->pfnProgress
    1161             :                                          : GDALDummyProgress;
    1162         149 :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1163         149 :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1164         149 :         if (bHasNoData)
    1165             :         {
    1166           0 :             psWarpOptions->papszWarpOptions = CSLSetNameValue(
    1167             :                 psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
    1168           0 :             if (psWarpOptions->padfSrcNoDataReal == nullptr)
    1169             :             {
    1170           0 :                 psWarpOptions->padfSrcNoDataReal =
    1171           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1172           0 :                 psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
    1173             :             }
    1174             : 
    1175           0 :             if (psWarpOptions->padfDstNoDataReal == nullptr)
    1176             :             {
    1177           0 :                 psWarpOptions->padfDstNoDataReal =
    1178           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1179           0 :                 psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
    1180             :             }
    1181             :         }
    1182             : 
    1183             :         GDALRasterIOTransformerStruct sTransformer;
    1184         149 :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1185         149 :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1186         149 :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1187         149 :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1188         149 :         psWarpOptions->pTransformerArg = &sTransformer;
    1189             : 
    1190             :         GDALWarpOperationH hWarpOperation =
    1191         149 :             GDALCreateWarpOperation(psWarpOptions);
    1192         149 :         eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
    1193             :                                      nDestYOffVirtual, nBufXSize, nBufYSize);
    1194         149 :         GDALDestroyWarpOperation(hWarpOperation);
    1195             : 
    1196         149 :         psWarpOptions->panSrcBands = nullptr;
    1197         149 :         psWarpOptions->panDstBands = nullptr;
    1198         149 :         GDALDestroyWarpOptions(psWarpOptions);
    1199             : 
    1200         149 :         if (hVRTDS)
    1201           0 :             GDALClose(hVRTDS);
    1202             :     }
    1203             :     else
    1204             :     {
    1205        2895 :         const char *pszResampling =
    1206        3551 :             (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
    1207         780 :             : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
    1208         246 :             : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
    1209         239 :             : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
    1210         172 :             : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
    1211          95 :             : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
    1212          43 :             : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
    1213           3 :             : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
    1214             :                                                                : "UNKNOWN";
    1215             : 
    1216        2895 :         int nKernelRadius = 0;
    1217             :         GDALResampleFunction pfnResampleFunc =
    1218        2895 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1219        2896 :         CPLAssert(pfnResampleFunc);
    1220             :         GDALDataType eWrkDataType =
    1221        2896 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1222        2895 :         int nHasNoData = 0;
    1223        2895 :         double dfNoDataValue = GetNoDataValue(&nHasNoData);
    1224        2896 :         const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    1225        2896 :         if (!bHasNoData)
    1226        2806 :             dfNoDataValue = 0.0;
    1227             : 
    1228        2896 :         int nDstBlockXSize = nBufXSize;
    1229        2896 :         int nDstBlockYSize = nBufYSize;
    1230        2896 :         int nFullResXChunk = 0;
    1231        2896 :         int nFullResYChunk = 0;
    1232             :         while (true)
    1233             :         {
    1234        2907 :             nFullResXChunk =
    1235        2907 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1236        2907 :             nFullResYChunk =
    1237        2907 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1238        2907 :             if (nFullResXChunk > nRasterXSize)
    1239        2664 :                 nFullResXChunk = nRasterXSize;
    1240        2907 :             if (nFullResYChunk > nRasterYSize)
    1241         265 :                 nFullResYChunk = nRasterYSize;
    1242        2907 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1243        2853 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1244             :                  1024 * 1024))
    1245             :                 break;
    1246             :             // When operating on the full width of a raster whose block width is
    1247             :             // the raster width, prefer doing chunks in height.
    1248          11 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1249             :                 nDstBlockYSize > 1)
    1250           0 :                 nDstBlockYSize /= 2;
    1251             :             /* Otherwise cut the maximal dimension */
    1252          11 :             else if (nDstBlockXSize > 1 &&
    1253           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1254          11 :                 nDstBlockXSize /= 2;
    1255             :             else
    1256           0 :                 nDstBlockYSize /= 2;
    1257             :         }
    1258             : 
    1259        2896 :         int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
    1260        2896 :         int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    1261        2896 :         if (nOvrXFactor == 0)
    1262        2029 :             nOvrXFactor = 1;
    1263        2896 :         if (nOvrYFactor == 0)
    1264        2028 :             nOvrYFactor = 1;
    1265        2896 :         int nFullResXSizeQueried =
    1266        2896 :             nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
    1267        2896 :         int nFullResYSizeQueried =
    1268        2896 :             nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
    1269             : 
    1270        2896 :         if (nFullResXSizeQueried > nRasterXSize)
    1271        2555 :             nFullResXSizeQueried = nRasterXSize;
    1272        2896 :         if (nFullResYSizeQueried > nRasterYSize)
    1273         154 :             nFullResYSizeQueried = nRasterYSize;
    1274             : 
    1275             :         void *pChunk =
    1276        2896 :             VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    1277             :                                 nFullResXSizeQueried, nFullResYSizeQueried);
    1278        2896 :         GByte *pabyChunkNoDataMask = nullptr;
    1279             : 
    1280        2896 :         GDALRasterBand *poMaskBand = GetMaskBand();
    1281        2895 :         int l_nMaskFlags = GetMaskFlags();
    1282             : 
    1283        2895 :         bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
    1284        2895 :         if (bUseNoDataMask)
    1285             :         {
    1286         157 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1287             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1288             :         }
    1289        2896 :         if (pChunk == nullptr ||
    1290         158 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1291             :         {
    1292           1 :             GDALClose(poMEMDS);
    1293           0 :             CPLFree(pChunk);
    1294           0 :             CPLFree(pabyChunkNoDataMask);
    1295           0 :             VSIFree(pTempBuffer);
    1296           0 :             return CE_Failure;
    1297             :         }
    1298             : 
    1299        2895 :         int nTotalBlocks = ((nBufXSize + nDstBlockXSize - 1) / nDstBlockXSize) *
    1300        2895 :                            ((nBufYSize + nDstBlockYSize - 1) / nDstBlockYSize);
    1301        2895 :         int nBlocksDone = 0;
    1302             : 
    1303             :         int nDstYOff;
    1304        5791 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1305        2896 :              nDstYOff += nDstBlockYSize)
    1306             :         {
    1307             :             int nDstYCount;
    1308        2896 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1309        2895 :                 nDstYCount = nDstBlockYSize;
    1310             :             else
    1311           1 :                 nDstYCount = nBufYSize - nDstYOff;
    1312             : 
    1313        2896 :             int nChunkYOff =
    1314        2896 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1315        2896 :             int nChunkYOff2 = nYOff + 1 +
    1316        2896 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1317             :                                                     dfYRatioDstToSrc));
    1318        2896 :             if (nChunkYOff2 > nRasterYSize)
    1319         377 :                 nChunkYOff2 = nRasterYSize;
    1320        2896 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1321        2896 :             CPLAssert(nYCount <= nFullResYChunk);
    1322             : 
    1323        2896 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
    1324        2896 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
    1325        2896 :             if (nChunkYOffQueried < 0)
    1326             :             {
    1327         277 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1328         277 :                 nChunkYOffQueried = 0;
    1329             :             }
    1330        2896 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1331         380 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1332        2896 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1333             : 
    1334        2896 :             int nDstXOff = 0;
    1335        5792 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1336        2896 :                  nDstXOff += nDstBlockXSize)
    1337             :             {
    1338        2896 :                 int nDstXCount = 0;
    1339        2896 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1340        2895 :                     nDstXCount = nDstBlockXSize;
    1341             :                 else
    1342           1 :                     nDstXCount = nBufXSize - nDstXOff;
    1343             : 
    1344        2896 :                 int nChunkXOff =
    1345        2896 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1346        2896 :                 int nChunkXOff2 =
    1347        2896 :                     nXOff + 1 +
    1348        2896 :                     static_cast<int>(
    1349        2896 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1350        2896 :                 if (nChunkXOff2 > nRasterXSize)
    1351        2676 :                     nChunkXOff2 = nRasterXSize;
    1352        2896 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1353        2896 :                 CPLAssert(nXCount <= nFullResXChunk);
    1354             : 
    1355        2896 :                 int nChunkXOffQueried =
    1356        2896 :                     nChunkXOff - nKernelRadius * nOvrXFactor;
    1357        2896 :                 int nChunkXSizeQueried =
    1358        2896 :                     nXCount + 2 * nKernelRadius * nOvrXFactor;
    1359        2896 :                 if (nChunkXOffQueried < 0)
    1360             :                 {
    1361        2580 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1362        2580 :                     nChunkXOffQueried = 0;
    1363             :                 }
    1364        2896 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1365        2567 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1366        2896 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1367             : 
    1368             :                 // Read the source buffers.
    1369        2896 :                 eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1370             :                                 nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1371             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    1372             :                                 eWrkDataType, 0, 0, nullptr);
    1373             : 
    1374        2896 :                 bool bSkipResample = false;
    1375        2896 :                 bool bNoDataMaskFullyOpaque = false;
    1376        2896 :                 if (eErr == CE_None && bUseNoDataMask)
    1377             :                 {
    1378         158 :                     eErr = poMaskBand->RasterIO(
    1379             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1380             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1381             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1382             :                         nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
    1383             : 
    1384             :                     /* Optimizations if mask if fully opaque or transparent */
    1385         158 :                     int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1386         158 :                     GByte bVal = pabyChunkNoDataMask[0];
    1387         158 :                     int i = 1;
    1388     3751460 :                     for (; i < nPixels; i++)
    1389             :                     {
    1390     3751410 :                         if (pabyChunkNoDataMask[i] != bVal)
    1391         104 :                             break;
    1392             :                     }
    1393         158 :                     if (i == nPixels)
    1394             :                     {
    1395          54 :                         if (bVal == 0)
    1396             :                         {
    1397         712 :                             for (int j = 0; j < nDstYCount; j++)
    1398             :                             {
    1399         686 :                                 GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
    1400             :                                                 static_cast<GByte *>(pDataMem) +
    1401         686 :                                                     nLSMem * (j + nDstYOff) +
    1402         686 :                                                     nDstXOff * nPSMem,
    1403             :                                                 eDTMem,
    1404             :                                                 static_cast<int>(nPSMem),
    1405             :                                                 nDstXCount);
    1406             :                             }
    1407          26 :                             bSkipResample = true;
    1408             :                         }
    1409             :                         else
    1410             :                         {
    1411          28 :                             bNoDataMaskFullyOpaque = true;
    1412             :                         }
    1413             :                     }
    1414             :                 }
    1415             : 
    1416        2896 :                 if (!bSkipResample && eErr == CE_None)
    1417             :                 {
    1418        2867 :                     const bool bPropagateNoData = false;
    1419        2867 :                     void *pDstBuffer = nullptr;
    1420        2867 :                     GDALDataType eDstBufferDataType = GDT_Unknown;
    1421             :                     GDALRasterBand *poMEMBand =
    1422        2867 :                         GDALRasterBand::FromHandle(hMEMBand);
    1423        2867 :                     GDALOverviewResampleArgs args;
    1424        2867 :                     args.eSrcDataType = eDataType;
    1425        2867 :                     args.eOvrDataType = poMEMBand->GetRasterDataType();
    1426        2867 :                     args.nOvrXSize = poMEMBand->GetXSize();
    1427        2867 :                     args.nOvrYSize = poMEMBand->GetYSize();
    1428        2867 :                     args.nOvrNBITS = nNBITS;
    1429        2867 :                     args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1430        2867 :                     args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1431        2867 :                     args.dfSrcXDelta =
    1432        2867 :                         dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1433        2867 :                     args.dfSrcYDelta =
    1434        2867 :                         dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1435        2867 :                     args.eWrkDataType = eWrkDataType;
    1436        2867 :                     args.pabyChunkNodataMask =
    1437        2867 :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
    1438        2867 :                     args.nChunkXOff =
    1439        2867 :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1440        2867 :                     args.nChunkXSize = nChunkXSizeQueried;
    1441        2867 :                     args.nChunkYOff =
    1442        2867 :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1443        2867 :                     args.nChunkYSize = nChunkYSizeQueried;
    1444        2867 :                     args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1445        2867 :                     args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
    1446        2867 :                     args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1447        2867 :                     args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
    1448        2867 :                     args.pszResampling = pszResampling;
    1449        2867 :                     args.bHasNoData = bHasNoData;
    1450        2867 :                     args.dfNoDataValue = dfNoDataValue;
    1451        2867 :                     args.poColorTable = GetColorTable();
    1452        2867 :                     args.bPropagateNoData = bPropagateNoData;
    1453        2867 :                     eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
    1454             :                                            &eDstBufferDataType);
    1455        2867 :                     if (eErr == CE_None)
    1456             :                     {
    1457        2867 :                         eErr = poMEMBand->RasterIO(
    1458             :                             GF_Write, nDstXOff + nDestXOffVirtual,
    1459             :                             nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
    1460             :                             pDstBuffer, nDstXCount, nDstYCount,
    1461             :                             eDstBufferDataType, 0, 0, nullptr);
    1462             :                     }
    1463        2867 :                     CPLFree(pDstBuffer);
    1464             :                 }
    1465             : 
    1466        2896 :                 nBlocksDone++;
    1467        3321 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1468         425 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    1469             :                                              "", psExtraArg->pProgressData))
    1470             :                 {
    1471           1 :                     eErr = CE_Failure;
    1472             :                 }
    1473             :             }
    1474             :         }
    1475             : 
    1476        2895 :         CPLFree(pChunk);
    1477        2896 :         CPLFree(pabyChunkNoDataMask);
    1478             :     }
    1479             : 
    1480        3045 :     if (eBufType != eDataType)
    1481             :     {
    1482          40 :         CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
    1483             :             GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
    1484             :             pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    1485             :             nullptr));
    1486             :     }
    1487        3045 :     GDALClose(poMEMDS);
    1488        3045 :     VSIFree(pTempBuffer);
    1489             : 
    1490        3045 :     return eErr;
    1491             : }
    1492             : 
    1493             : /************************************************************************/
    1494             : /*                          RasterIOResampled()                         */
    1495             : /************************************************************************/
    1496             : 
    1497         757 : CPLErr GDALDataset::RasterIOResampled(
    1498             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1499             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1500             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    1501             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    1502             : 
    1503             : {
    1504             : #if 0
    1505             :     // Determine if we use warping resampling or overview resampling
    1506             :     bool bUseWarp = false;
    1507             :     if( GDALDataTypeIsComplex( eDataType ) )
    1508             :         bUseWarp = true;
    1509             : #endif
    1510             : 
    1511         757 :     double dfXOff = nXOff;
    1512         757 :     double dfYOff = nYOff;
    1513         757 :     double dfXSize = nXSize;
    1514         757 :     double dfYSize = nYSize;
    1515         757 :     if (psExtraArg->bFloatingPointWindowValidity)
    1516             :     {
    1517         636 :         dfXOff = psExtraArg->dfXOff;
    1518         636 :         dfYOff = psExtraArg->dfYOff;
    1519         636 :         dfXSize = psExtraArg->dfXSize;
    1520         636 :         dfYSize = psExtraArg->dfYSize;
    1521             :     }
    1522             : 
    1523         757 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1524         757 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1525             : 
    1526             :     // Determine the coordinates in the "virtual" output raster to see
    1527             :     // if they are integers, in which case we will use them as a shift
    1528             :     // so that subwindow extracts give the exact same results as entire raster
    1529             :     // scaling.
    1530         757 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1531         757 :     bool bHasXOffVirtual = false;
    1532         757 :     int nDestXOffVirtual = 0;
    1533         757 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1534             :     {
    1535         623 :         bHasXOffVirtual = true;
    1536         623 :         dfXOff = nXOff;
    1537         623 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1538             :     }
    1539             : 
    1540         757 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1541         757 :     bool bHasYOffVirtual = false;
    1542         757 :     int nDestYOffVirtual = 0;
    1543         757 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1544             :     {
    1545         583 :         bHasYOffVirtual = true;
    1546         583 :         dfYOff = nYOff;
    1547         583 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1548             :     }
    1549             : 
    1550             :     // Create a MEM dataset that wraps the output buffer.
    1551             :     GDALDataset *poMEMDS =
    1552         757 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1553             :                            nDestYOffVirtual + nBufYSize, 0, eBufType, nullptr);
    1554             :     GDALRasterBand **papoDstBands = static_cast<GDALRasterBand **>(
    1555         752 :         CPLMalloc(nBandCount * sizeof(GDALRasterBand *)));
    1556         761 :     int nNBITS = 0;
    1557        2336 :     for (int i = 0; i < nBandCount; i++)
    1558             :     {
    1559        1585 :         char szBuffer[32] = {'\0'};
    1560        3181 :         int nRet = CPLPrintPointer(
    1561             :             szBuffer,
    1562        1585 :             static_cast<GByte *>(pData) - nPixelSpace * nDestXOffVirtual -
    1563        1585 :                 nLineSpace * nDestYOffVirtual + nBandSpace * i,
    1564             :             sizeof(szBuffer));
    1565        1596 :         szBuffer[nRet] = 0;
    1566             : 
    1567        1596 :         char szBuffer0[64] = {'\0'};
    1568        1596 :         snprintf(szBuffer0, sizeof(szBuffer0), "DATAPOINTER=%s", szBuffer);
    1569             : 
    1570        1596 :         char szBuffer1[64] = {'\0'};
    1571        1596 :         snprintf(szBuffer1, sizeof(szBuffer1), "PIXELOFFSET=" CPL_FRMT_GIB,
    1572             :                  static_cast<GIntBig>(nPixelSpace));
    1573             : 
    1574        1596 :         char szBuffer2[64] = {'\0'};
    1575        1596 :         snprintf(szBuffer2, sizeof(szBuffer2), "LINEOFFSET=" CPL_FRMT_GIB,
    1576             :                  static_cast<GIntBig>(nLineSpace));
    1577             : 
    1578        1596 :         char *apszOptions[4] = {szBuffer0, szBuffer1, szBuffer2, nullptr};
    1579             : 
    1580        1596 :         poMEMDS->AddBand(eBufType, apszOptions);
    1581             : 
    1582        1591 :         GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
    1583        1582 :         papoDstBands[i] = poMEMDS->GetRasterBand(i + 1);
    1584             :         const char *pszNBITS =
    1585        1575 :             poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1586        1576 :         if (pszNBITS)
    1587             :         {
    1588           0 :             nNBITS = atoi(pszNBITS);
    1589           0 :             poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
    1590           0 :                                                            "IMAGE_STRUCTURE");
    1591             :         }
    1592             :     }
    1593             : 
    1594         751 :     CPLErr eErr = CE_None;
    1595             : 
    1596             :     // TODO(schwehr): Why disabled?  Why not just delete?
    1597             :     // Looks like this code was initially added as disabled by copying
    1598             :     // from RasterIO here:
    1599             :     // https://trac.osgeo.org/gdal/changeset/29572
    1600             : #if 0
    1601             :     // Do the resampling.
    1602             :     if( bUseWarp )
    1603             :     {
    1604             :         VRTDatasetH hVRTDS = nullptr;
    1605             :         GDALRasterBandH hVRTBand = nullptr;
    1606             :         if( GetDataset() == nullptr )
    1607             :         {
    1608             :             /* Create VRT dataset that wraps the whole dataset */
    1609             :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1610             :             VRTAddBand( hVRTDS, eDataType, nullptr );
    1611             :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1612             :             VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
    1613             :                                 (GDALRasterBandH)this,
    1614             :                                 0, 0,
    1615             :                                 nRasterXSize, nRasterYSize,
    1616             :                                 0, 0,
    1617             :                                 nRasterXSize, nRasterYSize,
    1618             :                                 nullptr, VRT_NODATA_UNSET );
    1619             : 
    1620             :             /* Add a mask band if needed */
    1621             :             if( GetMaskFlags() != GMF_ALL_VALID )
    1622             :             {
    1623             :                 ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
    1624             :                 VRTSourcedRasterBand* poVRTMaskBand =
    1625             :                     (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
    1626             :                 poVRTMaskBand->
    1627             :                     AddMaskBandSource( this,
    1628             :                                     0, 0,
    1629             :                                     nRasterXSize, nRasterYSize,
    1630             :                                     0, 0,
    1631             :                                     nRasterXSize, nRasterYSize);
    1632             :             }
    1633             :         }
    1634             : 
    1635             :         GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
    1636             :         psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
    1637             :         psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
    1638             :         psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
    1639             :         psWarpOptions->nBandCount = 1;
    1640             :         int nSrcBandNumber = (hVRTDS ? 1 : nBand);
    1641             :         int nDstBandNumber = 1;
    1642             :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1643             :         psWarpOptions->panDstBands = &nDstBandNumber;
    1644             :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
    1645             :                     psExtraArg->pfnProgress : GDALDummyProgress;
    1646             :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1647             :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1648             :         GDALRasterIOTransformerStruct sTransformer;
    1649             :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1650             :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1651             :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1652             :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1653             :         psWarpOptions->pTransformerArg = &sTransformer;
    1654             : 
    1655             :         GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
    1656             :         eErr = GDALChunkAndWarpImage( hWarpOperation,
    1657             :                                       nDestXOffVirtual, nDestYOffVirtual,
    1658             :                                       nBufXSize, nBufYSize );
    1659             :         GDALDestroyWarpOperation( hWarpOperation );
    1660             : 
    1661             :         psWarpOptions->panSrcBands = nullptr;
    1662             :         psWarpOptions->panDstBands = nullptr;
    1663             :         GDALDestroyWarpOptions( psWarpOptions );
    1664             : 
    1665             :         if( hVRTDS )
    1666             :             GDALClose(hVRTDS);
    1667             :     }
    1668             :     else
    1669             : #endif
    1670             :     {
    1671         751 :         const char *pszResampling =
    1672        1385 :             (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
    1673         634 :             : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
    1674           0 :             : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
    1675           0 :             : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
    1676           0 :             : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
    1677           0 :             : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
    1678           0 :             : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
    1679           0 :             : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
    1680             :                                                                : "UNKNOWN";
    1681             : 
    1682         751 :         GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
    1683         740 :         GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
    1684             :         int nBlockXSize, nBlockYSize;
    1685         745 :         poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    1686             : 
    1687             :         int nKernelRadius;
    1688             :         GDALResampleFunction pfnResampleFunc =
    1689         739 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1690         743 :         CPLAssert(pfnResampleFunc);
    1691             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1692             :         GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
    1693             :             GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
    1694             : #endif
    1695             :         GDALDataType eWrkDataType =
    1696         743 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1697             : 
    1698         736 :         int nDstBlockXSize = nBufXSize;
    1699         736 :         int nDstBlockYSize = nBufYSize;
    1700             :         int nFullResXChunk, nFullResYChunk;
    1701             :         while (true)
    1702             :         {
    1703         736 :             nFullResXChunk =
    1704         736 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1705         736 :             nFullResYChunk =
    1706         736 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1707         736 :             if (nFullResXChunk > nRasterXSize)
    1708         557 :                 nFullResXChunk = nRasterXSize;
    1709         736 :             if (nFullResYChunk > nRasterYSize)
    1710          43 :                 nFullResYChunk = nRasterYSize;
    1711         736 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1712         734 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1713             :                  1024 * 1024))
    1714             :                 break;
    1715             :             // When operating on the full width of a raster whose block width is
    1716             :             // the raster width, prefer doing chunks in height.
    1717           0 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1718             :                 nDstBlockYSize > 1)
    1719           0 :                 nDstBlockYSize /= 2;
    1720             :             /* Otherwise cut the maximal dimension */
    1721           0 :             else if (nDstBlockXSize > 1 &&
    1722           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1723           0 :                 nDstBlockXSize /= 2;
    1724             :             else
    1725           0 :                 nDstBlockYSize /= 2;
    1726             :         }
    1727             : 
    1728        1477 :         int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    1729         736 :                                   static_cast<int>(0.5 + dfYRatioDstToSrc));
    1730         741 :         if (nOvrFactor == 0)
    1731          94 :             nOvrFactor = 1;
    1732         741 :         int nFullResXSizeQueried =
    1733         741 :             nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    1734         741 :         int nFullResYSizeQueried =
    1735         741 :             nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
    1736             : 
    1737         741 :         if (nFullResXSizeQueried > nRasterXSize)
    1738         587 :             nFullResXSizeQueried = nRasterXSize;
    1739         741 :         if (nFullResYSizeQueried > nRasterYSize)
    1740          46 :             nFullResYSizeQueried = nRasterYSize;
    1741             : 
    1742         741 :         void *pChunk = VSI_MALLOC3_VERBOSE(
    1743             :             cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
    1744             :                               nBandCount),
    1745             :             nFullResXSizeQueried, nFullResYSizeQueried);
    1746         754 :         GByte *pabyChunkNoDataMask = nullptr;
    1747             : 
    1748         754 :         GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
    1749         754 :         int nMaskFlags = poFirstSrcBand->GetMaskFlags();
    1750             : 
    1751         756 :         bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
    1752         756 :         if (bUseNoDataMask)
    1753             :         {
    1754         489 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1755             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1756             :         }
    1757         756 :         if (pChunk == nullptr ||
    1758         489 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1759             :         {
    1760          17 :             GDALClose(poMEMDS);
    1761           0 :             CPLFree(pChunk);
    1762           0 :             CPLFree(pabyChunkNoDataMask);
    1763           0 :             CPLFree(papoDstBands);
    1764           0 :             return CE_Failure;
    1765             :         }
    1766             : 
    1767         739 :         int nTotalBlocks = ((nBufXSize + nDstBlockXSize - 1) / nDstBlockXSize) *
    1768         739 :                            ((nBufYSize + nDstBlockYSize - 1) / nDstBlockYSize);
    1769         739 :         int nBlocksDone = 0;
    1770             : 
    1771             :         int nDstYOff;
    1772        1499 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1773         760 :              nDstYOff += nDstBlockYSize)
    1774             :         {
    1775             :             int nDstYCount;
    1776         739 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1777         739 :                 nDstYCount = nDstBlockYSize;
    1778             :             else
    1779           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1780             : 
    1781         739 :             int nChunkYOff =
    1782         739 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1783         739 :             int nChunkYOff2 = nYOff + 1 +
    1784         739 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1785             :                                                     dfYRatioDstToSrc));
    1786         739 :             if (nChunkYOff2 > nRasterYSize)
    1787          89 :                 nChunkYOff2 = nRasterYSize;
    1788         739 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1789         739 :             CPLAssert(nYCount <= nFullResYChunk);
    1790             : 
    1791         739 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    1792         739 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
    1793         739 :             if (nChunkYOffQueried < 0)
    1794             :             {
    1795          92 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1796          92 :                 nChunkYOffQueried = 0;
    1797             :             }
    1798         739 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1799         107 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1800         739 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1801             : 
    1802             :             int nDstXOff;
    1803        1498 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1804         759 :                  nDstXOff += nDstBlockXSize)
    1805             :             {
    1806             :                 int nDstXCount;
    1807         738 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1808         736 :                     nDstXCount = nDstBlockXSize;
    1809             :                 else
    1810           2 :                     nDstXCount = nBufXSize - nDstXOff;
    1811             : 
    1812         738 :                 int nChunkXOff =
    1813         738 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1814         738 :                 int nChunkXOff2 =
    1815         738 :                     nXOff + 1 +
    1816         738 :                     static_cast<int>(
    1817         738 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1818         738 :                 if (nChunkXOff2 > nRasterXSize)
    1819         588 :                     nChunkXOff2 = nRasterXSize;
    1820         738 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1821         738 :                 CPLAssert(nXCount <= nFullResXChunk);
    1822             : 
    1823         738 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    1824         738 :                 int nChunkXSizeQueried =
    1825         738 :                     nXCount + 2 * nKernelRadius * nOvrFactor;
    1826         738 :                 if (nChunkXOffQueried < 0)
    1827             :                 {
    1828         574 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1829         574 :                     nChunkXOffQueried = 0;
    1830             :                 }
    1831         738 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1832         595 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1833         738 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1834             : 
    1835         738 :                 bool bSkipResample = false;
    1836         738 :                 bool bNoDataMaskFullyOpaque = false;
    1837         738 :                 if (eErr == CE_None && bUseNoDataMask)
    1838             :                 {
    1839         489 :                     eErr = poMaskBand->RasterIO(
    1840             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1841             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1842             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1843             :                         nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
    1844             : 
    1845             :                     /* Optimizations if mask is fully opaque or transparent */
    1846         489 :                     const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1847         489 :                     const GByte bVal = pabyChunkNoDataMask[0];
    1848         489 :                     int i = 1;  // Used after for.
    1849    12776300 :                     for (; i < nPixels; i++)
    1850             :                     {
    1851    12775900 :                         if (pabyChunkNoDataMask[i] != bVal)
    1852          72 :                             break;
    1853             :                     }
    1854         489 :                     if (i == nPixels)
    1855             :                     {
    1856         417 :                         if (bVal == 0)
    1857             :                         {
    1858         373 :                             GByte abyZero[16] = {0};
    1859         780 :                             for (int iBand = 0; iBand < nBandCount; iBand++)
    1860             :                             {
    1861        3499 :                                 for (int j = 0; j < nDstYCount; j++)
    1862             :                                 {
    1863        3092 :                                     GDALCopyWords64(
    1864             :                                         abyZero, GDT_Byte, 0,
    1865             :                                         static_cast<GByte *>(pData) +
    1866        3092 :                                             iBand * nBandSpace +
    1867        3092 :                                             nLineSpace * (j + nDstYOff) +
    1868        3092 :                                             nDstXOff * nPixelSpace,
    1869             :                                         eBufType, static_cast<int>(nPixelSpace),
    1870             :                                         nDstXCount);
    1871             :                                 }
    1872             :                             }
    1873         373 :                             bSkipResample = true;
    1874             :                         }
    1875             :                         else
    1876             :                         {
    1877          44 :                             bNoDataMaskFullyOpaque = true;
    1878             :                         }
    1879             :                     }
    1880             :                 }
    1881             : 
    1882         738 :                 if (!bSkipResample && eErr == CE_None)
    1883             :                 {
    1884             :                     /* Read the source buffers */
    1885         368 :                     eErr = RasterIO(
    1886             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1887             :                         nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1888             :                         nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
    1889             :                         nBandCount, panBandMap, 0, 0, 0, nullptr);
    1890             :                 }
    1891             : 
    1892             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1893             :                 if (pfnResampleFuncMultiBands && !bSkipResample &&
    1894             :                     eErr == CE_None)
    1895             :                 {
    1896             :                     eErr = pfnResampleFuncMultiBands(
    1897             :                         dfXRatioDstToSrc, dfYRatioDstToSrc,
    1898             :                         dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
    1899             :                         dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
    1900             :                         eWrkDataType, (GByte *)pChunk, nBandCount,
    1901             :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
    1902             :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
    1903             :                         nChunkXSizeQueried,
    1904             :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
    1905             :                         nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
    1906             :                         nDstXOff + nDestXOffVirtual + nDstXCount,
    1907             :                         nDstYOff + nDestYOffVirtual,
    1908             :                         nDstYOff + nDestYOffVirtual + nDstYCount, papoDstBands,
    1909             :                         pszResampling, FALSE /*bHasNoData*/,
    1910             :                         0.0 /* dfNoDataValue */, nullptr /* color table*/,
    1911             :                         eDataType);
    1912             :                 }
    1913             :                 else
    1914             : #endif
    1915             :                 {
    1916             :                     size_t nChunkBandOffset =
    1917         751 :                         static_cast<size_t>(nChunkXSizeQueried) *
    1918         751 :                         nChunkYSizeQueried *
    1919         751 :                         GDALGetDataTypeSizeBytes(eWrkDataType);
    1920        1949 :                     for (int i = 0;
    1921        1949 :                          i < nBandCount && !bSkipResample && eErr == CE_None;
    1922             :                          i++)
    1923             :                     {
    1924        1190 :                         const bool bPropagateNoData = false;
    1925        1190 :                         void *pDstBuffer = nullptr;
    1926        1190 :                         GDALDataType eDstBufferDataType = GDT_Unknown;
    1927             :                         GDALRasterBand *poMEMBand =
    1928        1190 :                             poMEMDS->GetRasterBand(i + 1);
    1929        1191 :                         GDALOverviewResampleArgs args;
    1930        1191 :                         args.eSrcDataType = eDataType;
    1931        1191 :                         args.eOvrDataType = poMEMBand->GetRasterDataType();
    1932        1191 :                         args.nOvrXSize = poMEMBand->GetXSize();
    1933        1189 :                         args.nOvrYSize = poMEMBand->GetYSize();
    1934        1186 :                         args.nOvrNBITS = nNBITS;
    1935        1186 :                         args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1936        1186 :                         args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1937        1186 :                         args.dfSrcXDelta =
    1938        1186 :                             dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1939        1186 :                         args.dfSrcYDelta =
    1940        1186 :                             dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1941        1186 :                         args.eWrkDataType = eWrkDataType;
    1942        1186 :                         args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
    1943        1186 :                                                        ? nullptr
    1944             :                                                        : pabyChunkNoDataMask;
    1945        1186 :                         args.nChunkXOff =
    1946        1186 :                             nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1947        1186 :                         args.nChunkXSize = nChunkXSizeQueried;
    1948        1186 :                         args.nChunkYOff =
    1949        1186 :                             nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1950        1186 :                         args.nChunkYSize = nChunkYSizeQueried;
    1951        1186 :                         args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1952        1186 :                         args.nDstXOff2 =
    1953        1186 :                             nDstXOff + nDestXOffVirtual + nDstXCount;
    1954        1186 :                         args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1955        1186 :                         args.nDstYOff2 =
    1956        1186 :                             nDstYOff + nDestYOffVirtual + nDstYCount;
    1957        1186 :                         args.pszResampling = pszResampling;
    1958        1186 :                         args.bHasNoData = false;
    1959        1186 :                         args.dfNoDataValue = 0.0;
    1960        1186 :                         args.poColorTable = nullptr;
    1961        1186 :                         args.bPropagateNoData = bPropagateNoData;
    1962             : 
    1963             :                         eErr =
    1964        2378 :                             pfnResampleFunc(args,
    1965        1186 :                                             reinterpret_cast<GByte *>(pChunk) +
    1966        1186 :                                                 i * nChunkBandOffset,
    1967             :                                             &pDstBuffer, &eDstBufferDataType);
    1968        1192 :                         if (eErr == CE_None)
    1969             :                         {
    1970        1192 :                             eErr = poMEMBand->RasterIO(
    1971             :                                 GF_Write, nDstXOff + nDestXOffVirtual,
    1972             :                                 nDstYOff + nDestYOffVirtual, nDstXCount,
    1973             :                                 nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
    1974             :                                 eDstBufferDataType, 0, 0, nullptr);
    1975             :                         }
    1976        1192 :                         CPLFree(pDstBuffer);
    1977             :                     }
    1978             :                 }
    1979             : 
    1980         759 :                 nBlocksDone++;
    1981        1148 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1982         389 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    1983             :                                              "", psExtraArg->pProgressData))
    1984             :                 {
    1985           0 :                     eErr = CE_Failure;
    1986             :                 }
    1987             :             }
    1988             :         }
    1989             : 
    1990         760 :         CPLFree(pChunk);
    1991         758 :         CPLFree(pabyChunkNoDataMask);
    1992             :     }
    1993             : 
    1994         758 :     CPLFree(papoDstBands);
    1995         758 :     GDALClose(poMEMDS);
    1996             : 
    1997         758 :     return eErr;
    1998             : }
    1999             : 
    2000             : //! @endcond
    2001             : 
    2002             : /************************************************************************/
    2003             : /*                           GDALSwapWords()                            */
    2004             : /************************************************************************/
    2005             : 
    2006             : /**
    2007             :  * Byte swap words in-place.
    2008             :  *
    2009             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2010             :  * a memory array.  No assumption is made that the words being swapped are
    2011             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2012             :  * to determine if the current platform is big endian or little endian.  Use
    2013             :  * The macros like CPL_SWAP32() to byte swap single values without the overhead
    2014             :  * of a function call.
    2015             :  *
    2016             :  * @param pData pointer to start of data buffer.
    2017             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2018             :  * @param nWordCount the number of words to be swapped in this call.
    2019             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2020             :  * the next. For packed buffers this is the same as nWordSize.
    2021             :  */
    2022             : 
    2023      497157 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
    2024             :                                int nWordSkip)
    2025             : 
    2026             : {
    2027      497157 :     if (nWordCount > 0)
    2028      497157 :         VALIDATE_POINTER0(pData, "GDALSwapWords");
    2029             : 
    2030      497157 :     GByte *pabyData = static_cast<GByte *>(pData);
    2031             : 
    2032      497157 :     switch (nWordSize)
    2033             :     {
    2034        7234 :         case 1:
    2035        7234 :             break;
    2036             : 
    2037      476923 :         case 2:
    2038      476923 :             CPLAssert(nWordSkip >= 2 || nWordCount == 1);
    2039   228064000 :             for (int i = 0; i < nWordCount; i++)
    2040             :             {
    2041   227587000 :                 CPL_SWAP16PTR(pabyData);
    2042   227587000 :                 pabyData += nWordSkip;
    2043             :             }
    2044      476923 :             break;
    2045             : 
    2046       10514 :         case 4:
    2047       10514 :             CPLAssert(nWordSkip >= 4 || nWordCount == 1);
    2048       10514 :             if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
    2049             :             {
    2050    29139300 :                 for (int i = 0; i < nWordCount; i++)
    2051             :                 {
    2052    29128800 :                     *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
    2053             :                         *reinterpret_cast<const GUInt32 *>(pabyData));
    2054    29128800 :                     pabyData += nWordSkip;
    2055       10511 :                 }
    2056             :             }
    2057             :             else
    2058             :             {
    2059           9 :                 for (int i = 0; i < nWordCount; i++)
    2060             :                 {
    2061           6 :                     CPL_SWAP32PTR(pabyData);
    2062           6 :                     pabyData += nWordSkip;
    2063             :                 }
    2064             :             }
    2065       10514 :             break;
    2066             : 
    2067        2486 :         case 8:
    2068        2486 :             CPLAssert(nWordSkip >= 8 || nWordCount == 1);
    2069        2486 :             if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
    2070             :             {
    2071     3358160 :                 for (int i = 0; i < nWordCount; i++)
    2072             :                 {
    2073     3355680 :                     *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
    2074             :                         *reinterpret_cast<const GUInt64 *>(pabyData));
    2075     3355680 :                     pabyData += nWordSkip;
    2076        2485 :                 }
    2077             :             }
    2078             :             else
    2079             :             {
    2080           3 :                 for (int i = 0; i < nWordCount; i++)
    2081             :                 {
    2082           2 :                     CPL_SWAP64PTR(pabyData);
    2083           2 :                     pabyData += nWordSkip;
    2084             :                 }
    2085             :             }
    2086        2486 :             break;
    2087             : 
    2088           0 :         default:
    2089           0 :             CPLAssert(false);
    2090             :     }
    2091             : }
    2092             : 
    2093             : /************************************************************************/
    2094             : /*                           GDALSwapWordsEx()                          */
    2095             : /************************************************************************/
    2096             : 
    2097             : /**
    2098             :  * Byte swap words in-place.
    2099             :  *
    2100             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2101             :  * a memory array.  No assumption is made that the words being swapped are
    2102             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2103             :  * to determine if the current platform is big endian or little endian.  Use
    2104             :  * The macros like CPL_SWAP32() to byte swap single values without the overhead
    2105             :  * of a function call.
    2106             :  *
    2107             :  * @param pData pointer to start of data buffer.
    2108             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2109             :  * @param nWordCount the number of words to be swapped in this call.
    2110             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2111             :  * the next. For packed buffers this is the same as nWordSize.
    2112             :  * @since GDAL 2.1
    2113             :  */
    2114        6118 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
    2115             :                                  int nWordSkip)
    2116             : {
    2117        6118 :     GByte *pabyData = static_cast<GByte *>(pData);
    2118       12236 :     while (nWordCount)
    2119             :     {
    2120             :         // Pick-up a multiple of 8 as max chunk size.
    2121        6118 :         const int nWordCountSmall =
    2122        6118 :             (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
    2123        6118 :         GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
    2124        6118 :         pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
    2125        6118 :         nWordCount -= nWordCountSmall;
    2126             :     }
    2127        6118 : }
    2128             : 
    2129             : // Place the new GDALCopyWords helpers in an anonymous namespace
    2130             : namespace
    2131             : {
    2132             : 
    2133             : /************************************************************************/
    2134             : /*                           GDALCopyWordsT()                           */
    2135             : /************************************************************************/
    2136             : /**
    2137             :  * Template function, used to copy data from pSrcData into buffer
    2138             :  * pDstData, with stride nSrcPixelStride in the source data and
    2139             :  * stride nDstPixelStride in the destination data. This template can
    2140             :  * deal with the case where the input data type is real or complex and
    2141             :  * the output is real.
    2142             :  *
    2143             :  * @param pSrcData the source data buffer
    2144             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2145             :  *                      of interest.
    2146             :  * @param pDstData the destination buffer.
    2147             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2148             :  *                      interest.
    2149             :  * @param nWordCount the total number of pixel words to copy
    2150             :  *
    2151             :  * @code
    2152             :  * // Assume an input buffer of type GUInt16 named pBufferIn
    2153             :  * GByte *pBufferOut = new GByte[numBytesOut];
    2154             :  * GDALCopyWordsT<GUInt16, GByte>(pSrcData, 2, pDstData, 1, numBytesOut);
    2155             :  * @endcode
    2156             :  * @note
    2157             :  * This is a private function, and should not be exposed outside of
    2158             :  * rasterio.cpp. External users should call the GDALCopyWords driver function.
    2159             :  */
    2160             : 
    2161             : template <class Tin, class Tout>
    2162    46935824 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
    2163             :                                          int nSrcPixelStride,
    2164             :                                          Tout *const CPL_RESTRICT pDstData,
    2165             :                                          int nDstPixelStride,
    2166             :                                          GPtrDiff_t nWordCount)
    2167             : {
    2168    46935824 :     decltype(nWordCount) nDstOffset = 0;
    2169             : 
    2170    46935824 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2171    46935824 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2172   597284721 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2173             :     {
    2174   550346517 :         const Tin tValue =
    2175   550346517 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2176   550346517 :         Tout *const pOutPixel =
    2177   550346517 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2178             : 
    2179   550346517 :         GDALCopyWord(tValue, *pOutPixel);
    2180             : 
    2181   550349637 :         nDstOffset += nDstPixelStride;
    2182             :     }
    2183    46939056 : }
    2184             : 
    2185             : template <class Tin, class Tout>
    2186    37991671 : static void inline GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
    2187             :                                   int nSrcPixelStride,
    2188             :                                   Tout *const CPL_RESTRICT pDstData,
    2189             :                                   int nDstPixelStride, GPtrDiff_t nWordCount)
    2190             : {
    2191    37991671 :     GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
    2192             :                           nWordCount);
    2193    37991761 : }
    2194             : 
    2195             : template <class Tin, class Tout>
    2196      270689 : static void inline GDALCopyWordsT_8atatime(
    2197             :     const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
    2198             :     Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
    2199             :     GPtrDiff_t nWordCount)
    2200             : {
    2201      270689 :     decltype(nWordCount) nDstOffset = 0;
    2202             : 
    2203      270689 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2204      270689 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2205      270689 :     decltype(nWordCount) n = 0;
    2206      270689 :     if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
    2207             :         nDstPixelStride == static_cast<int>(sizeof(Tout)))
    2208             :     {
    2209    25827774 :         for (; n < nWordCount - 7; n += 8)
    2210             :         {
    2211    25565694 :             const Tin *pInValues = reinterpret_cast<const Tin *>(
    2212    25565694 :                 pSrcDataPtr + (n * nSrcPixelStride));
    2213    25565694 :             Tout *const pOutPixels =
    2214    25565694 :                 reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2215             : 
    2216    25565694 :             GDALCopy8Words(pInValues, pOutPixels);
    2217             : 
    2218    25560314 :             nDstOffset += 8 * nDstPixelStride;
    2219             :         }
    2220             :     }
    2221      786659 :     for (; n < nWordCount; n++)
    2222             :     {
    2223      515995 :         const Tin tValue =
    2224      515995 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2225      515995 :         Tout *const pOutPixel =
    2226      515995 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2227             : 
    2228      515995 :         GDALCopyWord(tValue, *pOutPixel);
    2229             : 
    2230      521339 :         nDstOffset += nDstPixelStride;
    2231             :     }
    2232      270664 : }
    2233             : 
    2234             : #ifdef HAVE_SSE2
    2235             : 
    2236             : template <class Tout>
    2237       38881 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
    2238             :                               int nSrcPixelStride,
    2239             :                               Tout *const CPL_RESTRICT pDstData,
    2240             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2241             : {
    2242             :     static_assert(std::is_integral<Tout>::value &&
    2243             :                       sizeof(Tout) == sizeof(uint16_t),
    2244             :                   "Bad Tout");
    2245       38881 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2246             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2247             :     {
    2248       32530 :         decltype(nWordCount) n = 0;
    2249       32530 :         const __m128i xmm_zero = _mm_setzero_si128();
    2250       32530 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2251             :             reinterpret_cast<GByte *>(pDstData);
    2252     1412288 :         for (; n < nWordCount - 15; n += 16)
    2253             :         {
    2254     1379758 :             __m128i xmm = _mm_loadu_si128(
    2255     1379758 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2256     1379758 :             __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
    2257     1379758 :             __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
    2258             :             _mm_storeu_si128(
    2259     1379758 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
    2260             :             _mm_storeu_si128(
    2261     1379758 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
    2262             :         }
    2263      106649 :         for (; n < nWordCount; n++)
    2264             :         {
    2265       74119 :             pDstData[n] = pSrcData[n];
    2266       32530 :         }
    2267             :     }
    2268             :     else
    2269             :     {
    2270        6351 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2271             :                               nDstPixelStride, nWordCount);
    2272             :     }
    2273       38881 : }
    2274             : 
    2275             : template <>
    2276       26545 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2277             :                     int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
    2278             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2279             : {
    2280       26545 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2281             :                              nDstPixelStride, nWordCount);
    2282       26545 : }
    2283             : 
    2284             : template <>
    2285       12336 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2286             :                     int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
    2287             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2288             : {
    2289       12336 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2290             :                              nDstPixelStride, nWordCount);
    2291       12336 : }
    2292             : 
    2293             : template <class Tout>
    2294    12842068 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
    2295             :                               int nSrcPixelStride,
    2296             :                               Tout *const CPL_RESTRICT pDstData,
    2297             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2298             : {
    2299             :     static_assert(std::is_integral<Tout>::value &&
    2300             :                       sizeof(Tout) == sizeof(uint32_t),
    2301             :                   "Bad Tout");
    2302    12842068 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2303             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2304             :     {
    2305     6286768 :         decltype(nWordCount) n = 0;
    2306     6286768 :         const __m128i xmm_zero = _mm_setzero_si128();
    2307     6286768 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2308             :             reinterpret_cast<GByte *>(pDstData);
    2309    70542610 :         for (; n < nWordCount - 15; n += 16)
    2310             :         {
    2311    64469252 :             __m128i xmm = _mm_loadu_si128(
    2312    64469252 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2313    64475452 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2314    64509952 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2315    64494352 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2316    64330852 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2317    64267552 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2318    64255852 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2319             :             _mm_storeu_si128(
    2320    64255852 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
    2321             :             _mm_storeu_si128(
    2322    64255852 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
    2323             :             _mm_storeu_si128(
    2324    64255852 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
    2325             :             _mm_storeu_si128(
    2326    64255852 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
    2327             :         }
    2328    14231699 :         for (; n < nWordCount; n++)
    2329             :         {
    2330     8158301 :             pDstData[n] = pSrcData[n];
    2331     6073358 :         }
    2332             :     }
    2333             :     else
    2334             :     {
    2335     6555350 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2336             :                               nDstPixelStride, nWordCount);
    2337             :     }
    2338    12630768 : }
    2339             : 
    2340             : template <>
    2341         468 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2342             :                     int nSrcPixelStride, GUInt32 *const CPL_RESTRICT pDstData,
    2343             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2344             : {
    2345         468 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2346             :                              nDstPixelStride, nWordCount);
    2347         468 : }
    2348             : 
    2349             : template <>
    2350    12844100 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2351             :                     int nSrcPixelStride, GInt32 *const CPL_RESTRICT pDstData,
    2352             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2353             : {
    2354    12844100 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2355             :                              nDstPixelStride, nWordCount);
    2356    12849000 : }
    2357             : 
    2358             : template <>
    2359     2471350 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2360             :                     int nSrcPixelStride, float *const CPL_RESTRICT pDstData,
    2361             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2362             : {
    2363     2471350 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2364             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2365             :     {
    2366      111905 :         decltype(nWordCount) n = 0;
    2367      111905 :         const __m128i xmm_zero = _mm_setzero_si128();
    2368      111905 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2369             :             reinterpret_cast<GByte *>(pDstData);
    2370     3259990 :         for (; n < nWordCount - 15; n += 16)
    2371             :         {
    2372     3148080 :             __m128i xmm = _mm_loadu_si128(
    2373     3148080 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2374     3148080 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2375     3148080 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2376     3148080 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2377     3148080 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2378     3148080 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2379     3148080 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2380     3148080 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2381     3148080 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2382     3148080 :             __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
    2383     3148080 :             __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
    2384     3148080 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2385             :                           xmm0_f);
    2386             :             _mm_storeu_ps(
    2387     3148080 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2388             :             _mm_storeu_ps(
    2389     3148080 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
    2390             :             _mm_storeu_ps(
    2391     3148080 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
    2392             :         }
    2393      478444 :         for (; n < nWordCount; n++)
    2394             :         {
    2395      366539 :             pDstData[n] = pSrcData[n];
    2396      111905 :         }
    2397             :     }
    2398             :     else
    2399             :     {
    2400     2359440 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2401             :                               nDstPixelStride, nWordCount);
    2402             :     }
    2403     2471350 : }
    2404             : 
    2405             : template <>
    2406      146754 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2407             :                     int nSrcPixelStride, double *const CPL_RESTRICT pDstData,
    2408             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2409             : {
    2410      146754 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2411             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2412             :     {
    2413      123698 :         decltype(nWordCount) n = 0;
    2414      123698 :         const __m128i xmm_zero = _mm_setzero_si128();
    2415      123698 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2416             :             reinterpret_cast<GByte *>(pDstData);
    2417     1422030 :         for (; n < nWordCount - 15; n += 16)
    2418             :         {
    2419     1298330 :             __m128i xmm = _mm_loadu_si128(
    2420     1298330 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2421     1298330 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2422     1298330 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2423     1298330 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2424     1298330 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2425     1298330 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2426     1298330 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2427             : 
    2428     1298330 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2429     1298330 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2430     1298330 :             __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
    2431     1298330 :             __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
    2432     1298330 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2433     1298330 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2434     1298330 :             xmm2 = _mm_srli_si128(xmm2, 8);
    2435     1298330 :             xmm3 = _mm_srli_si128(xmm3, 8);
    2436     1298330 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2437     1298330 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2438     1298330 :             __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
    2439     1298330 :             __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
    2440             : 
    2441     1298330 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2442             :                           xmm0_low_d);
    2443             :             _mm_storeu_pd(
    2444     1298330 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2445             :                 xmm0_high_d);
    2446             :             _mm_storeu_pd(
    2447     1298330 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2448             :                 xmm1_low_d);
    2449             :             _mm_storeu_pd(
    2450     1298330 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2451             :                 xmm1_high_d);
    2452             :             _mm_storeu_pd(
    2453     1298330 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2454             :                 xmm2_low_d);
    2455             :             _mm_storeu_pd(
    2456     1298330 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
    2457             :                 xmm2_high_d);
    2458             :             _mm_storeu_pd(
    2459     1298330 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2460             :                 xmm3_low_d);
    2461             :             _mm_storeu_pd(
    2462     1298330 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
    2463             :                 xmm3_high_d);
    2464             :         }
    2465      233373 :         for (; n < nWordCount; n++)
    2466             :         {
    2467      109675 :             pDstData[n] = pSrcData[n];
    2468      123698 :         }
    2469             :     }
    2470             :     else
    2471             :     {
    2472       23056 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2473             :                               nDstPixelStride, nWordCount);
    2474             :     }
    2475      146754 : }
    2476             : 
    2477             : template <>
    2478        6040 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2479             :                     int nSrcPixelStride, GByte *const CPL_RESTRICT pDstData,
    2480             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2481             : {
    2482        6040 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2483             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2484             :     {
    2485        5065 :         decltype(nWordCount) n = 0;
    2486             :         // In SSE2, min_epu16 does not exist, so shift from
    2487             :         // UInt16 to SInt16 to be able to use min_epi16
    2488        5065 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2489        5065 :         const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
    2490      138561 :         for (; n < nWordCount - 7; n += 8)
    2491             :         {
    2492      133496 :             __m128i xmm = _mm_loadu_si128(
    2493      133496 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2494      133496 :             xmm = _mm_add_epi16(xmm, xmm_UINT16_to_INT16);
    2495      133496 :             xmm = _mm_min_epi16(xmm, xmm_m255_shifted);
    2496      133496 :             xmm = _mm_sub_epi16(xmm, xmm_UINT16_to_INT16);
    2497      133496 :             xmm = _mm_packus_epi16(xmm, xmm);
    2498      133496 :             GDALCopyXMMToInt64(xmm,
    2499      133496 :                                reinterpret_cast<GPtrDiff_t *>(pDstData + n));
    2500             :         }
    2501       16083 :         for (; n < nWordCount; n++)
    2502             :         {
    2503       11018 :             pDstData[n] =
    2504       11018 :                 pSrcData[n] >= 255 ? 255 : static_cast<GByte>(pSrcData[n]);
    2505        5065 :         }
    2506             :     }
    2507             :     else
    2508             :     {
    2509         975 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2510             :                               nDstPixelStride, nWordCount);
    2511             :     }
    2512        6040 : }
    2513             : 
    2514             : template <>
    2515          49 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2516             :                     int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
    2517             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2518             : {
    2519          49 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2520             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2521             :     {
    2522          43 :         decltype(nWordCount) n = 0;
    2523             :         // In SSE2, min_epu16 does not exist, so shift from
    2524             :         // UInt16 to SInt16 to be able to use min_epi16
    2525          43 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2526          43 :         const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
    2527         115 :         for (; n < nWordCount - 7; n += 8)
    2528             :         {
    2529          72 :             __m128i xmm = _mm_loadu_si128(
    2530          72 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2531          72 :             xmm = _mm_add_epi16(xmm, xmm_UINT16_to_INT16);
    2532          72 :             xmm = _mm_min_epi16(xmm, xmm_32767_shifted);
    2533          72 :             xmm = _mm_sub_epi16(xmm, xmm_UINT16_to_INT16);
    2534          72 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm);
    2535             :         }
    2536         111 :         for (; n < nWordCount; n++)
    2537             :         {
    2538          68 :             pDstData[n] =
    2539          68 :                 pSrcData[n] >= 32767 ? 32767 : static_cast<GInt16>(pSrcData[n]);
    2540          43 :         }
    2541             :     }
    2542             :     else
    2543             :     {
    2544           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2545             :                               nDstPixelStride, nWordCount);
    2546             :     }
    2547          49 : }
    2548             : 
    2549             : template <>
    2550         342 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2551             :                     int nSrcPixelStride, float *const CPL_RESTRICT pDstData,
    2552             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2553             : {
    2554         342 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2555             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2556             :     {
    2557         336 :         decltype(nWordCount) n = 0;
    2558         336 :         const __m128i xmm_zero = _mm_setzero_si128();
    2559         336 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2560             :             reinterpret_cast<GByte *>(pDstData);
    2561        1286 :         for (; n < nWordCount - 7; n += 8)
    2562             :         {
    2563         950 :             __m128i xmm = _mm_loadu_si128(
    2564         950 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2565         950 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2566         950 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2567         950 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2568         950 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2569         950 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2570             :                           xmm0_f);
    2571             :             _mm_storeu_ps(
    2572         950 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2573             :         }
    2574        1043 :         for (; n < nWordCount; n++)
    2575             :         {
    2576         707 :             pDstData[n] = pSrcData[n];
    2577         336 :         }
    2578             :     }
    2579             :     else
    2580             :     {
    2581           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2582             :                               nDstPixelStride, nWordCount);
    2583             :     }
    2584         342 : }
    2585             : 
    2586             : template <>
    2587         373 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2588             :                     int nSrcPixelStride, double *const CPL_RESTRICT pDstData,
    2589             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2590             : {
    2591         373 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2592             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2593             :     {
    2594         262 :         decltype(nWordCount) n = 0;
    2595         262 :         const __m128i xmm_zero = _mm_setzero_si128();
    2596         262 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2597             :             reinterpret_cast<GByte *>(pDstData);
    2598         507 :         for (; n < nWordCount - 7; n += 8)
    2599             :         {
    2600         245 :             __m128i xmm = _mm_loadu_si128(
    2601         245 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2602         245 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2603         245 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2604             : 
    2605         245 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2606         245 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2607         245 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2608         245 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2609         245 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2610         245 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2611             : 
    2612         245 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2613             :                           xmm0_low_d);
    2614             :             _mm_storeu_pd(
    2615         245 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2616             :                 xmm0_high_d);
    2617             :             _mm_storeu_pd(
    2618         245 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2619             :                 xmm1_low_d);
    2620             :             _mm_storeu_pd(
    2621         245 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2622             :                 xmm1_high_d);
    2623             :         }
    2624         832 :         for (; n < nWordCount; n++)
    2625             :         {
    2626         570 :             pDstData[n] = pSrcData[n];
    2627         262 :         }
    2628             :     }
    2629             :     else
    2630             :     {
    2631         111 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2632             :                               nDstPixelStride, nWordCount);
    2633             :     }
    2634         373 : }
    2635             : 
    2636             : template <>
    2637        3091 : void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    2638             :                     int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
    2639             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2640             : {
    2641        3091 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2642             :                             nDstPixelStride, nWordCount);
    2643        3091 : }
    2644             : 
    2645             : #endif  // HAVE_SSE2
    2646             : 
    2647             : template <>
    2648      190103 : void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    2649             :                     int nSrcPixelStride, GByte *const CPL_RESTRICT pDstData,
    2650             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2651             : {
    2652      190103 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2653             :                             nDstPixelStride, nWordCount);
    2654      190103 : }
    2655             : 
    2656             : template <>
    2657       15796 : void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    2658             :                     int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
    2659             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2660             : {
    2661       15796 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2662             :                             nDstPixelStride, nWordCount);
    2663       15796 : }
    2664             : 
    2665             : template <>
    2666       61703 : void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    2667             :                     int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
    2668             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2669             : {
    2670       61703 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2671             :                             nDstPixelStride, nWordCount);
    2672       61700 : }
    2673             : 
    2674             : /************************************************************************/
    2675             : /*                   GDALCopyWordsComplexT()                            */
    2676             : /************************************************************************/
    2677             : /**
    2678             :  * Template function, used to copy data from pSrcData into buffer
    2679             :  * pDstData, with stride nSrcPixelStride in the source data and
    2680             :  * stride nDstPixelStride in the destination data. Deals with the
    2681             :  * complex case, where input is complex and output is complex.
    2682             :  *
    2683             :  * @param pSrcData the source data buffer
    2684             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2685             :  *                      of interest.
    2686             :  * @param pDstData the destination buffer.
    2687             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2688             :  *                      interest.
    2689             :  * @param nWordCount the total number of pixel words to copy
    2690             :  *
    2691             :  */
    2692             : template <class Tin, class Tout>
    2693       96717 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
    2694             :                                   int nSrcPixelStride,
    2695             :                                   Tout *const CPL_RESTRICT pDstData,
    2696             :                                   int nDstPixelStride, GPtrDiff_t nWordCount)
    2697             : {
    2698       96717 :     decltype(nWordCount) nDstOffset = 0;
    2699       96717 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2700       96717 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2701             : 
    2702     5241771 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2703             :     {
    2704     5145049 :         const Tin *const pPixelIn =
    2705     5145049 :             reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    2706     5145049 :         Tout *const pPixelOut =
    2707     5145049 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2708             : 
    2709     5145049 :         GDALCopyWord(pPixelIn[0], pPixelOut[0]);
    2710     5145049 :         GDALCopyWord(pPixelIn[1], pPixelOut[1]);
    2711             : 
    2712     5145049 :         nDstOffset += nDstPixelStride;
    2713             :     }
    2714       96717 : }
    2715             : 
    2716             : /************************************************************************/
    2717             : /*                   GDALCopyWordsComplexOutT()                         */
    2718             : /************************************************************************/
    2719             : /**
    2720             :  * Template function, used to copy data from pSrcData into buffer
    2721             :  * pDstData, with stride nSrcPixelStride in the source data and
    2722             :  * stride nDstPixelStride in the destination data. Deals with the
    2723             :  * case where the value is real coming in, but complex going out.
    2724             :  *
    2725             :  * @param pSrcData the source data buffer
    2726             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2727             :  *                      of interest, in bytes.
    2728             :  * @param pDstData the destination buffer.
    2729             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2730             :  *                      interest, in bytes.
    2731             :  * @param nWordCount the total number of pixel words to copy
    2732             :  *
    2733             :  */
    2734             : template <class Tin, class Tout>
    2735        3877 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
    2736             :                                      int nSrcPixelStride,
    2737             :                                      Tout *const CPL_RESTRICT pDstData,
    2738             :                                      int nDstPixelStride, GPtrDiff_t nWordCount)
    2739             : {
    2740        3877 :     decltype(nWordCount) nDstOffset = 0;
    2741             : 
    2742        3877 :     const Tout tOutZero = static_cast<Tout>(0);
    2743             : 
    2744        3877 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2745        3877 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2746             : 
    2747     1099414 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2748             :     {
    2749     1095537 :         const Tin tValue =
    2750     1095537 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    2751     1095537 :         Tout *const pPixelOut =
    2752     1095537 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2753     1095537 :         GDALCopyWord(tValue, *pPixelOut);
    2754             : 
    2755     1095537 :         pPixelOut[1] = tOutZero;
    2756             : 
    2757     1095537 :         nDstOffset += nDstPixelStride;
    2758             :     }
    2759        3877 : }
    2760             : 
    2761             : /************************************************************************/
    2762             : /*                           GDALCopyWordsFromT()                       */
    2763             : /************************************************************************/
    2764             : /**
    2765             :  * Template driver function. Given the input type T, call the appropriate
    2766             :  * GDALCopyWordsT function template for the desired output type. You should
    2767             :  * never call this function directly (call GDALCopyWords instead).
    2768             :  *
    2769             :  * @param pSrcData source data buffer
    2770             :  * @param nSrcPixelStride pixel stride in input buffer, in pixel words
    2771             :  * @param bInComplex input is complex
    2772             :  * @param pDstData destination data buffer
    2773             :  * @param eDstType destination data type
    2774             :  * @param nDstPixelStride pixel stride in output buffer, in pixel words
    2775             :  * @param nWordCount number of pixel words to be copied
    2776             :  */
    2777             : template <class T>
    2778    53872056 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
    2779             :                                int nSrcPixelStride, bool bInComplex,
    2780             :                                void *CPL_RESTRICT pDstData,
    2781             :                                GDALDataType eDstType, int nDstPixelStride,
    2782             :                                GPtrDiff_t nWordCount)
    2783             : {
    2784    53872056 :     switch (eDstType)
    2785             :     {
    2786     4580243 :         case GDT_Byte:
    2787     4580243 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2788             :                            static_cast<unsigned char *>(pDstData),
    2789             :                            nDstPixelStride, nWordCount);
    2790     4580301 :             break;
    2791         809 :         case GDT_Int8:
    2792         809 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2793             :                            static_cast<signed char *>(pDstData),
    2794             :                            nDstPixelStride, nWordCount);
    2795         809 :             break;
    2796      105082 :         case GDT_UInt16:
    2797      105082 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2798             :                            static_cast<unsigned short *>(pDstData),
    2799             :                            nDstPixelStride, nWordCount);
    2800      105076 :             break;
    2801     4127203 :         case GDT_Int16:
    2802     4127203 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2803             :                            static_cast<short *>(pDstData), nDstPixelStride,
    2804             :                            nWordCount);
    2805     4127203 :             break;
    2806        9486 :         case GDT_UInt32:
    2807        9486 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2808             :                            static_cast<unsigned int *>(pDstData),
    2809             :                            nDstPixelStride, nWordCount);
    2810        9486 :             break;
    2811    26049925 :         case GDT_Int32:
    2812    26049925 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2813             :                            static_cast<int *>(pDstData), nDstPixelStride,
    2814             :                            nWordCount);
    2815    26055626 :             break;
    2816         856 :         case GDT_UInt64:
    2817         856 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2818             :                            static_cast<std::uint64_t *>(pDstData),
    2819             :                            nDstPixelStride, nWordCount);
    2820         856 :             break;
    2821        5170 :         case GDT_Int64:
    2822        5170 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2823             :                            static_cast<std::int64_t *>(pDstData),
    2824             :                            nDstPixelStride, nWordCount);
    2825        5170 :             break;
    2826         942 :         case GDT_Float16:
    2827         942 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2828             :                            static_cast<GFloat16 *>(pDstData), nDstPixelStride,
    2829             :                            nWordCount);
    2830         942 :             break;
    2831     3695809 :         case GDT_Float32:
    2832     3695809 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2833             :                            static_cast<float *>(pDstData), nDstPixelStride,
    2834             :                            nWordCount);
    2835     3695809 :             break;
    2836    15194189 :         case GDT_Float64:
    2837    15194189 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2838             :                            static_cast<double *>(pDstData), nDstPixelStride,
    2839             :                            nWordCount);
    2840    15194249 :             break;
    2841       94123 :         case GDT_CInt16:
    2842       94123 :             if (bInComplex)
    2843             :             {
    2844       92870 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2845             :                                       static_cast<short *>(pDstData),
    2846             :                                       nDstPixelStride, nWordCount);
    2847             :             }
    2848             :             else  // input is not complex, so we need to promote to a complex
    2849             :                   // buffer
    2850             :             {
    2851        1253 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2852             :                                          static_cast<short *>(pDstData),
    2853             :                                          nDstPixelStride, nWordCount);
    2854             :             }
    2855       94123 :             break;
    2856        1052 :         case GDT_CInt32:
    2857        1052 :             if (bInComplex)
    2858             :             {
    2859         421 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2860             :                                       static_cast<int *>(pDstData),
    2861             :                                       nDstPixelStride, nWordCount);
    2862             :             }
    2863             :             else  // input is not complex, so we need to promote to a complex
    2864             :                   // buffer
    2865             :             {
    2866         631 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2867             :                                          static_cast<int *>(pDstData),
    2868             :                                          nDstPixelStride, nWordCount);
    2869             :             }
    2870        1052 :             break;
    2871         281 :         case GDT_CFloat16:
    2872         281 :             if (bInComplex)
    2873             :             {
    2874          16 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2875             :                                       static_cast<GFloat16 *>(pDstData),
    2876             :                                       nDstPixelStride, nWordCount);
    2877             :             }
    2878             :             else  // input is not complex, so we need to promote to a complex
    2879             :                   // buffer
    2880             :             {
    2881         265 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2882             :                                          static_cast<GFloat16 *>(pDstData),
    2883             :                                          nDstPixelStride, nWordCount);
    2884             :             }
    2885         281 :             break;
    2886        3359 :         case GDT_CFloat32:
    2887        3359 :             if (bInComplex)
    2888             :             {
    2889        2564 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2890             :                                       static_cast<float *>(pDstData),
    2891             :                                       nDstPixelStride, nWordCount);
    2892             :             }
    2893             :             else  // input is not complex, so we need to promote to a complex
    2894             :                   // buffer
    2895             :             {
    2896         795 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2897             :                                          static_cast<float *>(pDstData),
    2898             :                                          nDstPixelStride, nWordCount);
    2899             :             }
    2900        3359 :             break;
    2901        1779 :         case GDT_CFloat64:
    2902        1779 :             if (bInComplex)
    2903             :             {
    2904         846 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2905             :                                       static_cast<double *>(pDstData),
    2906             :                                       nDstPixelStride, nWordCount);
    2907             :             }
    2908             :             else  // input is not complex, so we need to promote to a complex
    2909             :                   // buffer
    2910             :             {
    2911         933 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2912             :                                          static_cast<double *>(pDstData),
    2913             :                                          nDstPixelStride, nWordCount);
    2914             :             }
    2915        1779 :             break;
    2916           0 :         case GDT_Unknown:
    2917             :         case GDT_TypeCount:
    2918           0 :             CPLAssert(false);
    2919             :     }
    2920    53877811 : }
    2921             : 
    2922             : }  // end anonymous namespace
    2923             : 
    2924             : /************************************************************************/
    2925             : /*                          GDALReplicateWord()                         */
    2926             : /************************************************************************/
    2927             : 
    2928             : template <class T>
    2929      530285 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
    2930             :                                GPtrDiff_t nWordCount)
    2931             : {
    2932      530285 :     const T valSet = *static_cast<const T *>(pDstData);
    2933      530285 :     if (nDstPixelStride == static_cast<int>(sizeof(T)))
    2934             :     {
    2935      500546 :         T *pDstPtr = static_cast<T *>(pDstData) + 1;
    2936    20668583 :         while (nWordCount >= 4)
    2937             :         {
    2938    20168093 :             nWordCount -= 4;
    2939    20168093 :             pDstPtr[0] = valSet;
    2940    20168093 :             pDstPtr[1] = valSet;
    2941    20168093 :             pDstPtr[2] = valSet;
    2942    20168093 :             pDstPtr[3] = valSet;
    2943    20168093 :             pDstPtr += 4;
    2944             :         }
    2945     1269777 :         while (nWordCount > 0)
    2946             :         {
    2947      769231 :             --nWordCount;
    2948      769231 :             *pDstPtr = valSet;
    2949      769231 :             pDstPtr++;
    2950             :         }
    2951             :     }
    2952             :     else
    2953             :     {
    2954       29751 :         GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
    2955     1040338 :         while (nWordCount > 0)
    2956             :         {
    2957     1010587 :             --nWordCount;
    2958     1010587 :             *reinterpret_cast<T *>(pabyDstPtr) = valSet;
    2959     1010587 :             pabyDstPtr += nDstPixelStride;
    2960             :         }
    2961             :     }
    2962      530285 : }
    2963             : 
                       // Fills a destination buffer with copies of one source word.
                       //
                       // The single word at pSrcData is converted from eSrcType to eDstType by
                       // GDALCopyWords64() and stored in the first destination slot. That
                       // already-converted word is then duplicated into the nWordCount - 1 slots
                       // that follow, so the type conversion is performed only once.
                       //
                       // pSrcData         word to replicate (a single word is read)
                       // eSrcType         data type of the source word
                       // pDstData         start of the destination buffer
                       // eDstType         data type of the destination words
                       // nDstPixelStride  distance, in bytes, between consecutive destination words
                       // nWordCount       total number of destination words, first one included
    2964      912783 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
    2965             :                               GDALDataType eSrcType,
    2966             :                               void *CPL_RESTRICT pDstData,
    2967             :                               GDALDataType eDstType, int nDstPixelStride,
    2968             :                               GPtrDiff_t nWordCount)
    2969             : {
    2970             :     /* -----------------------------------------------------------------------
    2971             :      */
    2972             :     /* Special case when the source data is always the same value */
    2973             :     /* (for VRTSourcedRasterBand::IRasterIO and
    2974             :      * VRTDerivedRasterBand::IRasterIO*/
    2975             :     /*  for example) */
    2976             :     /* -----------------------------------------------------------------------
    2977             :      */
    2978             :     // Let the general translation case do the necessary conversions
    2979             :     // on the first destination element.
    2980      912783 :     GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
    2981             : 
    2982             :     // Now copy the first element to the nWordCount - 1 following destination
    2983             :     // elements.
    2984      912810 :     nWordCount--;
                           // pabyDstWord addresses the second destination word.
    2985      912810 :     GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
    2986             : 
                           // Duplicate the converted word according to the destination type.
    2987      912810 :     switch (eDstType)
    2988             :     {
    2989      382426 :         case GDT_Byte:
    2990             :         case GDT_Int8:
    2991             :         {
                                   // Contiguous bytes: one memset() fills all remaining slots.
    2992      382426 :             if (nDstPixelStride == 1)
    2993             :             {
    2994      342121 :                 if (nWordCount > 0)
    2995      342121 :                     memset(pabyDstWord,
    2996      342121 :                            *reinterpret_cast<const GByte *>(pDstData),
    2997             :                            nWordCount);
    2998             :             }
    2999             :             else
    3000             :             {
                                       // Strided bytes: store the value once per slot.
    3001       40305 :                 GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
    3002    23942200 :                 while (nWordCount > 0)
    3003             :                 {
    3004    23901900 :                     --nWordCount;
    3005    23901900 :                     *pabyDstWord = valSet;
    3006    23901900 :                     pabyDstWord += nDstPixelStride;
    3007             :                 }
    3008             :             }
    3009      382426 :             break;
    3010             :         }
    3011             : 
                       // Non-complex multi-byte types: delegate to GDALReplicateWordT<c_type>.
    3012             : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type)                               \
    3013             :     case enum_type:                                                            \
    3014             :     {                                                                          \
    3015             :         GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount);     \
    3016             :         break;                                                                 \
    3017             :     }
    3018             : 
    3019        1723 :             CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
    3020      169649 :             CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
    3021          56 :             CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
    3022      300220 :             CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
    3023          21 :             CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
    3024        1024 :             CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
    3025           0 :             CASE_DUPLICATE_SIMPLE(GDT_Float16, GFloat16)
    3026       52459 :             CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
    3027        5120 :             CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
    3028             : 
                       // Complex types: a destination word is made of two consecutive c_type
                       // components; both are read from the first slot and written to each
                       // following slot.
    3029             : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type)                              \
    3030             :     case enum_type:                                                            \
    3031             :     {                                                                          \
    3032             :         c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0];        \
    3033             :         c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1];        \
    3034             :         while (nWordCount > 0)                                                 \
    3035             :         {                                                                      \
    3036             :             --nWordCount;                                                      \
    3037             :             reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1;              \
    3038             :             reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2;              \
    3039             :             pabyDstWord += nDstPixelStride;                                    \
    3040             :         }                                                                      \
    3041             :         break;                                                                 \
    3042             :     }
    3043             : 
    3044         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
    3045         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
    3046           0 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat16, GFloat16)
    3047         784 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
    3048         784 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
    3049             : 
                               // These enumerators are not usable as a destination type here;
                               // reaching this branch trips the assertion.
    3050           0 :         case GDT_Unknown:
    3051             :         case GDT_TypeCount:
    3052           0 :             CPLAssert(false);
    3053             :     }
    3054      912744 : }
    3055             : 
    3056             : /************************************************************************/
    3057             : /*                        GDALUnrolledCopy()                            */
    3058             : /************************************************************************/
    3059             : 
                       // Copies nIters elements of type T from pSrc to pDest.
                       // srcStride and dstStride are compile-time steps, counted in elements of
                       // T, between consecutive source and destination elements respectively.
                       // Whole groups of 16 elements are handled by a manually unrolled loop;
                       // the remaining nIters % 16 elements are copied one at a time.
    3060             : template <class T, int srcStride, int dstStride>
    3061     3135680 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
    3062             :                                            const T *CPL_RESTRICT pSrc,
    3063             :                                            GPtrDiff_t nIters)
    3064             : {
    3065     3135680 :     if (nIters >= 16)
    3066             :     {
                               // One pass per full group of 16 elements.
    3067   135333105 :         for (GPtrDiff_t i = nIters / 16; i != 0; i--)
    3068             :         {
    3069   132324519 :             pDest[0 * dstStride] = pSrc[0 * srcStride];
    3070   132324519 :             pDest[1 * dstStride] = pSrc[1 * srcStride];
    3071   132324519 :             pDest[2 * dstStride] = pSrc[2 * srcStride];
    3072   132324519 :             pDest[3 * dstStride] = pSrc[3 * srcStride];
    3073   132324519 :             pDest[4 * dstStride] = pSrc[4 * srcStride];
    3074   132324519 :             pDest[5 * dstStride] = pSrc[5 * srcStride];
    3075   132324519 :             pDest[6 * dstStride] = pSrc[6 * srcStride];
    3076   132324519 :             pDest[7 * dstStride] = pSrc[7 * srcStride];
    3077   132324519 :             pDest[8 * dstStride] = pSrc[8 * srcStride];
    3078   132324519 :             pDest[9 * dstStride] = pSrc[9 * srcStride];
    3079   132324519 :             pDest[10 * dstStride] = pSrc[10 * srcStride];
    3080   132324519 :             pDest[11 * dstStride] = pSrc[11 * srcStride];
    3081   132324519 :             pDest[12 * dstStride] = pSrc[12 * srcStride];
    3082   132324519 :             pDest[13 * dstStride] = pSrc[13 * srcStride];
    3083   132324519 :             pDest[14 * dstStride] = pSrc[14 * srcStride];
    3084   132324519 :             pDest[15 * dstStride] = pSrc[15 * srcStride];
    3085   132324519 :             pDest += 16 * dstStride;
    3086   132324519 :             pSrc += 16 * srcStride;
    3087             :         }
    3088     3008585 :         nIters = nIters % 16;
    3089             :     }
                           // Tail: pDest is indexed from its current position while pSrc is
                           // advanced in place.
    3090     5377278 :     for (GPtrDiff_t i = 0; i < nIters; i++)
    3091             :     {
    3092     2241600 :         pDest[i * dstStride] = *pSrc;
    3093     2241600 :         pSrc += srcStride;
    3094             :     }
    3095     3135680 : }
    3096             : 
                       // Primary GDALUnrolledCopy template: forwards to GDALUnrolledCopyGeneric()
                       // with the same <T, srcStride, dstStride> arguments. Explicit
                       // specializations for particular argument combinations may replace it.
    3097             : template <class T, int srcStride, int dstStride>
    3098     3129579 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
    3099             :                                     const T *CPL_RESTRICT pSrc,
    3100             :                                     GPtrDiff_t nIters)
    3101             : {
    3102     3129579 :     GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
    3103     3129596 : }
    3104             : 
    3105             : #ifdef HAVE_SSE2
    3106             : 
                       // SSE2 specialization for bytes read with a source step of 2 and written
                       // contiguously. Each vector iteration loads 32 source bytes, keeps the low
                       // byte of every 16-bit lane and packs the 16 kept bytes into one 16-byte
                       // store. The vector loop only runs while i < nIters - 16; the elements it
                       // leaves behind, and every call with nIters <= 16, go through the scalar
                       // loop at the end.
    3107             : template <>
    3108      352916 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
    3109             :                                    const GByte *CPL_RESTRICT pSrc,
    3110             :                                    GPtrDiff_t nIters)
    3111             : {
    3112      352916 :     decltype(nIters) i = 0;
    3113      352916 :     if (nIters > 16)
    3114             :     {
                               // 0x00ff in every 16-bit lane.
    3115      194663 :         const __m128i xmm_mask = _mm_set1_epi16(0xff);
    3116             :         // If we were sure that there would always be 1 trailing byte, we could
    3117             :         // check against nIters - 15
                               // (the second load below reads source byte 31 of the chunk, one byte
                               // past the 16th selected element at offset 30).
    3118     2988090 :         for (; i < nIters - 16; i += 16)
    3119             :         {
    3120             :             __m128i xmm0 =
    3121     2793430 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3122             :             __m128i xmm1 =
    3123     5586860 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3124             :             // Set higher 8bit of each int16 packed word to 0
    3125     2793430 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3126     2793430 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3127             :             // Pack int16 to uint8 and merge back both vector
    3128     2793430 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    3129             : 
    3130             :             // Store result
    3131     2793430 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3132             : 
    3133     2793430 :             pSrc += 2 * 16;
    3134             :         }
    3135             :     }
                           // Scalar copy of whatever the vector loop did not handle.
    3136     4619880 :     for (; i < nIters; i++)
    3137             :     {
    3138     4266960 :         pDest[i] = *pSrc;
    3139     4266960 :         pSrc += 2;
    3140             :     }
    3141      352916 : }
    3142             : 
    3143             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    3144             : 
                       // Specialization for bytes read with a source step of 3 and written
                       // contiguously. When more than 16 elements are requested and
                       // CPLHaveRuntimeSSSE3() reports support, the copy is handed to
                       // GDALUnrolledCopy_GByte_3_1_SSSE3() (defined outside this section);
                       // otherwise the generic unrolled copy is used.
    3145             : template <>
    3146      191860 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3147             :                                    const GByte *CPL_RESTRICT pSrc,
    3148             :                                    GPtrDiff_t nIters)
    3149             : {
    3150      191860 :     if (nIters > 16 && CPLHaveRuntimeSSSE3())
    3151             :     {
    3152      185760 :         GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    3153             :     }
    3154             :     else
    3155             :     {
    3156        6100 :         GDALUnrolledCopyGeneric<GByte, 3, 1>(pDest, pSrc, nIters);
    3157             :     }
    3158      191860 : }
    3159             : 
    3160             : #endif
    3161             : 
                       // SSE2 specialization for bytes read with a source step of 4 and written
                       // contiguously. Each vector iteration loads 64 source bytes, keeps the low
                       // byte of every 32-bit lane and narrows the 16 kept values to bytes in two
                       // pack steps before one 16-byte store. The vector loop only runs while
                       // i < nIters - 16; the elements it leaves behind, and every call with
                       // nIters <= 16, go through the scalar loop at the end.
    3162             : template <>
    3163      106241 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
    3164             :                                    const GByte *CPL_RESTRICT pSrc,
    3165             :                                    GPtrDiff_t nIters)
    3166             : {
    3167      106241 :     decltype(nIters) i = 0;
    3168      106241 :     if (nIters > 16)
    3169             :     {
                               // 0x000000ff in every 32-bit lane.
    3170      100948 :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    3171             :         // If we were sure that there would always be 3 trailing bytes, we could
    3172             :         // check against nIters - 15
                               // (the fourth load below reads source bytes 61..63 of the chunk, past
                               // the 16th selected element at offset 60).
    3173     9914600 :         for (; i < nIters - 16; i += 16)
    3174             :         {
    3175             :             __m128i xmm0 =
    3176     9813290 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3177             :             __m128i xmm1 =
    3178     9813290 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3179             :             __m128i xmm2 =
    3180     9813290 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    3181             :             __m128i xmm3 =
    3182    19626600 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
    3183             :             // Set higher 24bit of each int32 packed word to 0
    3184     9813290 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3185     9813290 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3186     9813290 :             xmm2 = _mm_and_si128(xmm2, xmm_mask);
    3187     9813290 :             xmm3 = _mm_and_si128(xmm3, xmm_mask);
                                   // After masking every lane holds 0..255, so the saturating packs
                                   // below do not alter any value.
    3188             :             // Pack int32 to int16
    3189     9813590 :             xmm0 = _mm_packs_epi32(xmm0, xmm1);
    3190     9813510 :             xmm2 = _mm_packs_epi32(xmm2, xmm3);
    3191             :             // Pack int16 to uint8
    3192     9813650 :             xmm0 = _mm_packus_epi16(xmm0, xmm2);
    3193             : 
    3194             :             // Store result
    3195     9813650 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3196             : 
    3197     9813650 :             pSrc += 4 * 16;
    3198             :         }
    3199             :     }
                           // Scalar copy of whatever the vector loop did not handle.
    3200     1136160 :     for (; i < nIters; i++)
    3201             :     {
    3202     1029550 :         pDest[i] = *pSrc;
    3203     1029550 :         pSrc += 4;
    3204             :     }
    3205      106604 : }
    3206             : #endif  // HAVE_SSE2
    3207             : 
    3208             : /************************************************************************/
    3209             : /*                         GDALFastCopy()                               */
    3210             : /************************************************************************/
    3211             : 
                       // Copies nIters elements of type T from pSrc to pDest without any value
                       // conversion. nDestStride and nSrcStride are expressed in bytes: they are
                       // compared against multiples of sizeof(T) to select a path, and divided by
                       // sizeof(T) before being applied to the T pointers.
                       //   - a single element: plain assignment
                       //   - both sides packed: memcpy()
                       //   - one side packed, the other stepping by 2, 3 or 4 elements:
                       //     GDALUnrolledCopy<>
                       //   - anything else: element-by-element loop
                       // NOTE(review): the fallback loops use stride / sizeof(T), so a stride that
                       // is not a multiple of sizeof(T) would be truncated - presumably callers
                       // guarantee multiples; confirm.
    3212             : template <class T>
    3213    40035200 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
    3214             :                                 const T *CPL_RESTRICT pSrc, int nSrcStride,
    3215             :                                 GPtrDiff_t nIters)
    3216             : {
    3217    40035200 :     constexpr int sizeofT = static_cast<int>(sizeof(T));
    3218    40035200 :     if (nIters == 1)
    3219             :     {
    3220    22098510 :         *pDest = *pSrc;
    3221             :     }
                           // Destination is packed.
    3222    17936753 :     else if (nDestStride == sizeofT)
    3223             :     {
    3224    14839784 :         if (nSrcStride == sizeofT)
    3225             :         {
    3226    14073987 :             memcpy(pDest, pSrc, nIters * sizeof(T));
    3227             :         }
    3228      765756 :         else if (nSrcStride == 2 * sizeofT)
    3229             :         {
    3230      355869 :             GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
    3231             :         }
    3232      409887 :         else if (nSrcStride == 3 * sizeofT)
    3233             :         {
    3234      288432 :             GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
    3235             :         }
    3236      121455 :         else if (nSrcStride == 4 * sizeofT)
    3237             :         {
    3238      110223 :             GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
    3239             :         }
    3240             :         else
    3241             :         {
    3242    12966620 :             while (nIters-- > 0)
    3243             :             {
    3244    12955430 :                 *pDest = *pSrc;
    3245    12955430 :                 pSrc += nSrcStride / sizeofT;
    3246    12955430 :                 pDest++;
    3247             :             }
    3248             :         }
    3249             :     }
                           // Source is packed, destination is not.
    3250     3096979 :     else if (nSrcStride == sizeofT)
    3251             :     {
    3252     3091245 :         if (nDestStride == 2 * sizeofT)
    3253             :         {
    3254      158659 :             GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
    3255             :         }
    3256     2932580 :         else if (nDestStride == 3 * sizeofT)
    3257             :         {
    3258     2144782 :             GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
    3259             :         }
    3260      787802 :         else if (nDestStride == 4 * sizeofT)
    3261             :         {
    3262      722632 :             GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
    3263             :         }
    3264             :         else
    3265             :         {
    3266    12883710 :             while (nIters-- > 0)
    3267             :             {
    3268    12818500 :                 *pDest = *pSrc;
    3269    12818500 :                 pSrc++;
    3270    12818500 :                 pDest += nDestStride / sizeofT;
    3271             :             }
    3272             :         }
    3273             :     }
                           // Neither side is packed: step both pointers by their own stride.
    3274             :     else
    3275             :     {
    3276     1212836 :         while (nIters-- > 0)
    3277             :         {
    3278     1207102 :             *pDest = *pSrc;
    3279     1207102 :             pSrc += nSrcStride / sizeofT;
    3280     1207102 :             pDest += nDestStride / sizeofT;
    3281             :         }
    3282             :     }
    3283    40034700 : }
    3284             : 
    3285             : /************************************************************************/
    3286             : /*                         GDALFastCopyByte()                           */
    3287             : /************************************************************************/
    3288             : 
                       // Byte-typed entry point to GDALFastCopy(). Parameters are taken
                       // source-first here and passed on destination-first, as GDALFastCopy()
                       // expects.
    3289      326246 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
    3290             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
    3291             :                              int nDstPixelStride, GPtrDiff_t nWordCount)
    3292             : {
    3293      326246 :     GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
    3294             :                  nWordCount);
    3295      326246 : }
    3296             : 
    3297             : /************************************************************************/
    3298             : /*                           GDALCopyWords()                            */
    3299             : /************************************************************************/
    3300             : 
    3301             : /**
    3302             :  * Copy pixel words from buffer to buffer.
    3303             :  *
    3304             :  * @see GDALCopyWords64()
    3305             :  */
    3306    86959800 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
    3307             :                                GDALDataType eSrcType, int nSrcPixelStride,
    3308             :                                void *CPL_RESTRICT pDstData,
    3309             :                                GDALDataType eDstType, int nDstPixelStride,
    3310             :                                int nWordCount)
    3311             : {
                           // Same operation as GDALCopyWords64(); only the word count type differs
                           // (int here, GPtrDiff_t there), so every argument is passed through.
    3312    86959800 :     GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
    3313             :                     nDstPixelStride, nWordCount);
    3314    86958200 : }
    3315             : 
    3316             : /************************************************************************/
    3317             : /*                          GDALCopyWords64()                           */
    3318             : /************************************************************************/
    3319             : 
    3320             : /**
    3321             :  * Copy pixel words from buffer to buffer.
    3322             :  *
    3323             :  * This function is used to copy pixel word values from one memory buffer
    3324             :  * to another, with support for conversion between data types, and differing
    3325             :  * step factors. The data type conversion is done using the following
    3326             :  * rules:
    3327             :  * <ul>
    3328             :  * <li>Values assigned to a lower range integer type are clipped. For
    3329             :  * instance assigning GDT_Int16 values to a GDT_Byte buffer will cause values
    3330             :  * less than 0 to be set to 0, and values larger than 255 to be set to 255.
    3331             :  * </li>
    3332             :  * <li>
    3333             :  * Assignment from floating point to integer rounds to closest integer.
    3334             :  * +Infinity is mapped to the largest integer. -Infinity is mapped to the
    3335             :  * smallest integer. NaN is mapped to 0.
    3336             :  * </li>
    3337             :  * <li>
    3338             :  * Assignment from non-complex to complex will result in the imaginary part
    3339             :  * being set to zero on output.
    3340             :  * </li>
    3341             :  * <li> Assignment from complex to
    3342             :  * non-complex will result in the complex portion being lost and the real
    3343             :  * component being preserved (<i>not magnitude!</i>).
    3344             :  * </li>
    3345             :  * </ul>
    3346             :  *
    3347             :  * No assumptions are made about the source or destination words occurring
    3348             :  * on word boundaries.  It is assumed that all values are in native machine
    3349             :  * byte order.
    3350             :  *
    3351             :  * @param pSrcData Pointer to source data to be converted.
    3352             :  * @param eSrcType the source data type (see GDALDataType enum)
    3353             :  * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
    3354             :  * in bytes
    3355             :  * @param pDstData Pointer to buffer where destination data should go
    3356             :  * @param eDstType the destination data type (see GDALDataType enum)
    3357             :  * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
    3358             :  * words), in bytes
    3359             :  * @param nWordCount number of words to be copied
    3360             :  *
    3361             :  * @note
    3362             :  * When adding a new data type to GDAL, you must do the following to
    3363             :  * support it properly within the GDALCopyWords function:
    3364             :  * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
    3365             :  *    This should invoke the appropriate GDALCopyWordsFromT wrapper.
    3366             :  * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
    3367             :  *    This should call the appropriate GDALCopyWordsT template.
    3368             :  * 3. If appropriate, overload the appropriate CopyWord template in the
    3369             :  *    above namespace. This will ensure that any conversion issues are
    3370             :  *    handled (cases like the float -> int32 case, where the min/max
    3371             :  *    values are subject to roundoff error).
    3372             :  */
    3373             : 
    3374   108600000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
    3375             :                                  GDALDataType eSrcType, int nSrcPixelStride,
    3376             :                                  void *CPL_RESTRICT pDstData,
    3377             :                                  GDALDataType eDstType, int nDstPixelStride,
    3378             :                                  GPtrDiff_t nWordCount)
    3379             : 
    3380             : {
    3381             :     // On platforms where alignment matters, be careful
    3382   108600000 :     const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
    3383   108595000 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
    3384   108601000 :     if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
    3385             :     {
    3386           2 :         CPLError(CE_Failure, CPLE_NotSupported,
    3387             :                  "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
    3388             :                  "argument");
    3389           2 :         return;
    3390             :     }
    3391   108601000 :     if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
    3392    58399000 :         ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
    3393    58401200 :          (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
    3394    58400600 :          (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
    3395    58400100 :          (nDstPixelStride % nDstDataTypeSize) != 0))
    3396             :     {
    3397         905 :         if (eSrcType == eDstType)
    3398             :         {
    3399       34800 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    3400             :             {
    3401       34000 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    3402             :                        static_cast<const GByte *>(pSrcData) +
    3403       34000 :                            nSrcPixelStride * i,
    3404             :                        nDstDataTypeSize);
    3405             :             }
    3406             :         }
    3407             :         else
    3408             :         {
    3409         210 :             const auto getAlignedPtr = [](GByte *ptr, int align)
    3410             :             {
    3411             :                 return ptr +
    3412         210 :                        ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
    3413         210 :                         align);
    3414             :             };
    3415             : 
    3416             :             // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
    3417             :             // be sure to get correctly aligned pointer.
    3418         105 :             constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
    3419             :             GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
    3420             :             GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
    3421             :             GByte *pabySrcBuffer =
    3422         105 :                 getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
    3423             :             GByte *pabyDstBuffer =
    3424         105 :                 getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
    3425        3360 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    3426             :             {
    3427        3255 :                 memcpy(pabySrcBuffer,
    3428             :                        static_cast<const GByte *>(pSrcData) +
    3429        3255 :                            nSrcPixelStride * i,
    3430             :                        nSrcDataTypeSize);
    3431        3255 :                 GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
    3432             :                                 eDstType, 0, 1);
    3433        3255 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    3434             :                        pabyDstBuffer, nDstDataTypeSize);
    3435             :             }
    3436             :         }
    3437         905 :         return;
    3438             :     }
    3439             : 
    3440             :     // Deal with the case where we're replicating a single word into the
    3441             :     // provided buffer
    3442   108600000 :     if (nSrcPixelStride == 0 && nWordCount > 1)
    3443             :     {
    3444      912795 :         GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
    3445             :                           nDstPixelStride, nWordCount);
    3446      912772 :         return;
    3447             :     }
    3448             : 
    3449   107688000 :     if (eSrcType == eDstType)
    3450             :     {
    3451    53953300 :         if (eSrcType == GDT_Byte || eSrcType == GDT_Int8)
    3452             :         {
    3453    19047800 :             GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
    3454             :                          static_cast<const GByte *>(pSrcData), nSrcPixelStride,
    3455             :                          nWordCount);
    3456    19046200 :             return;
    3457             :         }
    3458             : 
    3459    34905500 :         if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
    3460    20668400 :             (nDstPixelStride % 2) == 0)
    3461             :         {
    3462    20668400 :             GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
    3463             :                          static_cast<const short *>(pSrcData), nSrcPixelStride,
    3464             :                          nWordCount);
    3465    20668200 :             return;
    3466             :         }
    3467             : 
    3468    14237100 :         if (nWordCount == 1)
    3469             :         {
    3470             : #if defined(CSA_BUILD) || defined(__COVERITY__)
    3471             :             // Avoid false positives...
    3472             :             memcpy(pDstData, pSrcData, nSrcDataTypeSize);
    3473             : #else
    3474    13852500 :             if (nSrcDataTypeSize == 2)
    3475           0 :                 memcpy(pDstData, pSrcData, 2);
    3476    13852500 :             else if (nSrcDataTypeSize == 4)
    3477    13809100 :                 memcpy(pDstData, pSrcData, 4);
    3478       43451 :             else if (nSrcDataTypeSize == 8)
    3479       26931 :                 memcpy(pDstData, pSrcData, 8);
    3480             :             else /* if( eSrcType == GDT_CFloat64 ) */
    3481       16520 :                 memcpy(pDstData, pSrcData, 16);
    3482             : #endif
    3483    13852500 :             return;
    3484             :         }
    3485             : 
    3486             :         // Let memcpy() handle the case where we're copying a packed buffer
    3487             :         // of pixels.
    3488      384627 :         if (nSrcPixelStride == nDstPixelStride)
    3489             :         {
    3490      256838 :             if (nSrcPixelStride == nSrcDataTypeSize)
    3491             :             {
    3492      256767 :                 memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
    3493      256767 :                 return;
    3494             :             }
    3495             :         }
    3496             :     }
    3497             : 
    3498             :     // Handle the more general case -- deals with conversion of data types
    3499             :     // directly.
    3500    53862200 :     switch (eSrcType)
    3501             :     {
    3502    15506500 :         case GDT_Byte:
    3503    15506500 :             GDALCopyWordsFromT<unsigned char>(
    3504             :                 static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
    3505             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3506    15509500 :             break;
    3507        1254 :         case GDT_Int8:
    3508        1254 :             GDALCopyWordsFromT<signed char>(
    3509             :                 static_cast<const signed char *>(pSrcData), nSrcPixelStride,
    3510             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3511        1254 :             break;
    3512       53350 :         case GDT_UInt16:
    3513       53350 :             GDALCopyWordsFromT<unsigned short>(
    3514             :                 static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
    3515             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3516       53350 :             break;
    3517     4350250 :         case GDT_Int16:
    3518     4350250 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    3519             :                                       nSrcPixelStride, false, pDstData,
    3520             :                                       eDstType, nDstPixelStride, nWordCount);
    3521     4350270 :             break;
    3522        7094 :         case GDT_UInt32:
    3523        7094 :             GDALCopyWordsFromT<unsigned int>(
    3524             :                 static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
    3525             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3526        7094 :             break;
    3527    12255000 :         case GDT_Int32:
    3528    12255000 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    3529             :                                     nSrcPixelStride, false, pDstData, eDstType,
    3530             :                                     nDstPixelStride, nWordCount);
    3531    12255000 :             break;
    3532        1663 :         case GDT_UInt64:
    3533        1663 :             GDALCopyWordsFromT<std::uint64_t>(
    3534             :                 static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
    3535             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3536        1663 :             break;
    3537       10994 :         case GDT_Int64:
    3538       10994 :             GDALCopyWordsFromT<std::int64_t>(
    3539             :                 static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
    3540             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3541       10994 :             break;
    3542        1169 :         case GDT_Float16:
    3543        1169 :             GDALCopyWordsFromT<GFloat16>(
    3544             :                 static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, false,
    3545             :                 pDstData, eDstType, nDstPixelStride, nWordCount);
    3546        1169 :             break;
    3547      395108 :         case GDT_Float32:
    3548      395108 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    3549             :                                       nSrcPixelStride, false, pDstData,
    3550             :                                       eDstType, nDstPixelStride, nWordCount);
    3551      395104 :             break;
    3552    20634800 :         case GDT_Float64:
    3553    20634800 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    3554             :                                        nSrcPixelStride, false, pDstData,
    3555             :                                        eDstType, nDstPixelStride, nWordCount);
    3556    20634800 :             break;
    3557      478156 :         case GDT_CInt16:
    3558      478156 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    3559             :                                       nSrcPixelStride, true, pDstData, eDstType,
    3560             :                                       nDstPixelStride, nWordCount);
    3561      478156 :             break;
    3562         571 :         case GDT_CInt32:
    3563         571 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    3564             :                                     nSrcPixelStride, true, pDstData, eDstType,
    3565             :                                     nDstPixelStride, nWordCount);
    3566         571 :             break;
    3567         436 :         case GDT_CFloat16:
    3568         436 :             GDALCopyWordsFromT<GFloat16>(
    3569             :                 static_cast<const GFloat16 *>(pSrcData), nSrcPixelStride, true,
    3570             :                 pDstData, eDstType, nDstPixelStride, nWordCount);
    3571         436 :             break;
    3572        1577 :         case GDT_CFloat32:
    3573        1577 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    3574             :                                       nSrcPixelStride, true, pDstData, eDstType,
    3575             :                                       nDstPixelStride, nWordCount);
    3576        1577 :             break;
    3577      174239 :         case GDT_CFloat64:
    3578      174239 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    3579             :                                        nSrcPixelStride, true, pDstData,
    3580             :                                        eDstType, nDstPixelStride, nWordCount);
    3581      174239 :             break;
    3582           0 :         case GDT_Unknown:
    3583             :         case GDT_TypeCount:
    3584           0 :             CPLAssert(false);
    3585             :     }
    3586             : }
    3587             : 
    3588             : /************************************************************************/
    3589             : /*                            GDALCopyBits()                            */
    3590             : /************************************************************************/
    3591             : 
    3592             : /**
    3593             :  * Bitwise word copying.
    3594             :  *
    3595             :  * A function for moving sets of partial bytes around.  Loosely
    3596             :  * speaking this is a bitwise analog to GDALCopyWords().
    3597             :  *
    3598             :  * It copies nStepCount "words" where each word is nBitCount bits long.
    3599             :  * The nSrcStep and nDstStep are the number of bits from the start of one
    3600             :  * word to the next (same as nBitCount if they are packed).  The nSrcOffset
    3601             :  * and nDstOffset are the offset into the source and destination buffers
    3602             :  * to start at, also measured in bits.
    3603             :  *
    3604             :  * All bit offsets are assumed to start from the high order bit in a byte
    3605             :  * (i.e. most significant bit first).  Currently this function is not very
    3606             :  * optimized, but it may be improved for some common cases in the future
    3607             :  * as needed.
                     :  *
                     :  * Each copied bit is individually set or cleared in the destination, so
                     :  * destination bits outside the copied words keep their value.  Only
                     :  * pabySrcData is passed to VALIDATE_POINTER0(); pabyDstData is not
                     :  * checked, and neither buffer is bounds-checked.
    3608             :  *
    3609             :  * @param pabySrcData the source data buffer.
    3610             :  * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
    3611             :  * first word to copy.
    3612             :  * @param nSrcStep the offset in bits from the start of one source word to
    3613             :  * the start of the next.
    3614             :  * @param pabyDstData the destination data buffer.
    3615             :  * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
    3616             :  * first word to copy over.
    3617             :  * @param nDstStep the offset in bits from the start of one destination word
    3618             :  * to the start of the next.
    3619             :  * @param nBitCount the number of bits in a word to be copied.
    3620             :  * @param nStepCount the number of words to copy.
    3621             :  */
    3622             : 
    3623           0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
    3624             :                   GByte *pabyDstData, int nDstOffset, int nDstStep,
    3625             :                   int nBitCount, int nStepCount)
    3626             : 
    3627             : {
    3628           0 :     VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
    3629             :     // Copy word by word.  (offset >> 3) is the byte index and
                     :     // 0x80 >> (offset & 7) the mask of the bit within that byte.
    3630           0 :     for (int iStep = 0; iStep < nStepCount; iStep++)
    3631             :     {
    3632           0 :         for (int iBit = 0; iBit < nBitCount; iBit++)
    3633             :         {
    3634           0 :             if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
    3635           0 :                 pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
    3636             :             else
    3637           0 :                 pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
    3638             :             // Advance to the next bit of the current word.
    3639           0 :             nSrcOffset++;
    3640           0 :             nDstOffset++;
    3641             :         }
    3642             :         // nBitCount bits were consumed above: add the remainder of the step.
    3643           0 :         nSrcOffset += (nSrcStep - nBitCount);
    3644           0 :         nDstOffset += (nDstStep - nBitCount);
    3645             :     }
    3646             : }
    3647             : 
    3648             : /************************************************************************/
    3649             : /*                   GDALBandGetBestOverviewLevel()                     */
    3650             : /*                                                                      */
    3651             : /* Returns the best overview level to satisfy the query or -1 if none.  */
    3652             : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when        */
    3653             : /* returning a valid overview level                                     */
    3654             : /************************************************************************/
    3655             : /* Wrapper around GDALBandGetBestOverviewLevel2() that passes a null
                     :  * psExtraArg: no GDALRasterIOExtraArg is read or updated through this
                     :  * entry point, only the four window references may be rewritten. */
    3656           0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
    3657             :                                  int &nXSize, int &nYSize, int nBufXSize,
    3658             :                                  int nBufYSize)
    3659             : {
    3660           0 :     return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
    3661           0 :                                          nBufXSize, nBufYSize, nullptr);
    3662             : }
    3663             : /* Selects the overview of poBand best suited to read the source window
                     :  * (nXOff, nYOff, nXSize, nYSize) into a buffer of
                     :  * nBufXSize x nBufYSize pixels.
                     :  *
                     :  * Returns the index of the selected overview, or -1 when no overview
                     :  * qualifies.  On success nXOff, nYOff, nXSize and nYSize are rewritten
                     :  * in the pixel coordinates of the selected overview, and psExtraArg
                     :  * (when not null) may get its floating-point window (dfXOff, dfYOff,
                     :  * dfXSize, dfYSize) rescaled or initialized.  When -1 is returned
                     :  * none of these outputs has been modified.
                     :  *
                     :  * psExtraArg may be null.
                     :  */
    3664      523804 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
    3665             :                                   int &nYOff, int &nXSize, int &nYSize,
    3666             :                                   int nBufXSize, int nBufYSize,
    3667             :                                   GDALRasterIOExtraArg *psExtraArg)
    3668             : {
    3669      523804 :     if (psExtraArg != nullptr && psExtraArg->nVersion > 1 &&
    3670      523804 :         psExtraArg->bUseOnlyThisScale)
    3671         109 :         return -1;  // bUseOnlyThisScale is set: never report an overview
    3672             :     /* -------------------------------------------------------------------- */
    3673             :     /*      Compute the desired downsampling factor.  It is                 */
    3674             :     /*      based on the least reduced axis, and represents the number      */
    3675             :     /*      of source pixels to one destination pixel.                      */
    3676             :     /* -------------------------------------------------------------------- */
    3677      523695 :     const double dfDesiredDownsamplingFactor =
    3678      523695 :         ((nXSize / static_cast<double>(nBufXSize)) <
    3679      361357 :              (nYSize / static_cast<double>(nBufYSize)) ||
    3680             :          nBufYSize == 1)  // single-row buffer: always use the X ratio
    3681      752080 :             ? nXSize / static_cast<double>(nBufXSize)
    3682      132972 :             : nYSize / static_cast<double>(nBufYSize);
    3683             : 
    3684             :     /* -------------------------------------------------------------------- */
    3685             :     /*      Find the overview with the largest downsampling factor          */
    3686             :     /*      (most downsampled) that is still less downsampled than, or      */
    3687             :     /*      only a little more downsampled than, the request.               */
    3688             :     /* -------------------------------------------------------------------- */
    3689      523695 :     const int nOverviewCount = poBand->GetOverviewCount();
    3690      523695 :     GDALRasterBand *poBestOverview = nullptr;
    3691      523695 :     double dfBestDownsamplingFactor = 0;
    3692      523695 :     int nBestOverviewLevel = -1;
    3693             :     // Optional override of the oversampling threshold by configuration option.
    3694             :     const char *pszOversampligThreshold =
    3695      523695 :         CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
    3696             :     // Threshold precedence: the configuration option when set, else 1.0
                     :     // when psExtraArg requests a non-nearest resampling, else 1.2.
    3697             :     // Note: keep this logic for overview selection in sync between
    3698             :     // gdalwarp_lib.cpp and rasterio.cpp
    3699             :     // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
    3700             :     const double dfOversamplingThreshold =
    3701     1047380 :         pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
    3702      523686 :         : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
    3703     1047370 :             ? 1.0
    3704      523695 :             : 1.2;
    3705      526390 :     for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    3706             :     {
    3707        5547 :         GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
    3708       11094 :         if (poOverview == nullptr ||
    3709       11093 :             poOverview->GetXSize() > poBand->GetXSize() ||
    3710        5546 :             poOverview->GetYSize() > poBand->GetYSize())
    3711             :         {
    3712           1 :             continue;  // missing overview, or one larger than the band itself
    3713             :         }
    3714             : 
    3715             :         // Compute downsampling factor of this overview
    3716             :         const double dfDownsamplingFactor = std::min(
    3717        5546 :             poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
    3718       11092 :             poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
    3719             : 
    3720             :         // Skip this overview if it is too downsampled for the request
    3721             :         // (threshold included) or not more downsampled than the current best.
    3722             :         // Use an epsilon because of numerical instability.
    3723        5546 :         constexpr double EPSILON = 1e-1;
    3724        5654 :         if (dfDownsamplingFactor >=
    3725        5546 :                 dfDesiredDownsamplingFactor * dfOversamplingThreshold +
    3726        5438 :                     EPSILON ||
    3727             :             dfDownsamplingFactor <= dfBestDownsamplingFactor)
    3728             :         {
    3729         108 :             continue;
    3730             :         }
    3731             : 
    3732             :         // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
    3733        5438 :         const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
    3734             : 
    3735        5438 :         if (pszResampling != nullptr &&
    3736          71 :             STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
    3737          16 :             continue;
    3738             : 
    3739             :         // OK, this is our new best overview.
    3740        5422 :         poBestOverview = poOverview;
    3741        5422 :         nBestOverviewLevel = iOverview;
    3742        5422 :         dfBestDownsamplingFactor = dfDownsamplingFactor;
    3743             :         // Within EPSILON of the requested factor: stop searching.
    3744        5422 :         if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
    3745             :             EPSILON)
    3746             :         {
    3747        2852 :             break;
    3748             :         }
    3749             :     }
    3750             : 
    3751             :     /* -------------------------------------------------------------------- */
    3752             :     /*      If we didn't find an overview that helps us, just return        */
    3753             :     /*      indicating failure and the full resolution image will be used.  */
    3754             :     /* -------------------------------------------------------------------- */
    3755      523695 :     if (nBestOverviewLevel < 0)
    3756      520771 :         return -1;
    3757             : 
    3758             :     /* -------------------------------------------------------------------- */
    3759             :     /*      Recompute the source window in terms of the selected            */
    3760             :     /*      overview.                                                       */
    3761             :     /* -------------------------------------------------------------------- */
    3762             :     const double dfXFactor =
    3763        2924 :         poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
    3764             :     const double dfYFactor =
    3765        2924 :         poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
    3766        2924 :     CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
    3767             :              poBestOverview->GetYSize());
    3768             :     // Offsets and sizes are scaled, then +0.5 and truncated to int.  Offsets
                     :     // are capped at the last overview pixel, sizes are at least 1 and are
                     :     // reduced so that the window ends inside the overview.
    3769        8772 :     const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
    3770        2924 :                                 static_cast<int>(nXOff / dfXFactor + 0.5));
    3771        8772 :     const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
    3772        2924 :                                 static_cast<int>(nYOff / dfYFactor + 0.5));
    3773        2924 :     int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
    3774        2924 :     int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
    3775        2924 :     if (nOXOff + nOXSize > poBestOverview->GetXSize())
    3776           0 :         nOXSize = poBestOverview->GetXSize() - nOXOff;
    3777        2924 :     if (nOYOff + nOYSize > poBestOverview->GetYSize())
    3778           2 :         nOYSize = poBestOverview->GetYSize() - nOYOff;
    3779             :     // Express the floating-point window in overview coordinates too: rescale
                     :     // it when already valid, otherwise derive it from the (still full
                     :     // resolution) integer window when a non-nearest resampling is requested.
    3780        2924 :     if (psExtraArg)
    3781             :     {
    3782        2924 :         if (psExtraArg->bFloatingPointWindowValidity)
    3783             :         {
    3784          50 :             psExtraArg->dfXOff /= dfXFactor;
    3785          50 :             psExtraArg->dfXSize /= dfXFactor;
    3786          50 :             psExtraArg->dfYOff /= dfYFactor;
    3787          50 :             psExtraArg->dfYSize /= dfYFactor;
    3788             :         }
    3789        2874 :         else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
    3790             :         {
    3791          16 :             psExtraArg->bFloatingPointWindowValidity = true;
    3792          16 :             psExtraArg->dfXOff = nXOff / dfXFactor;
    3793          16 :             psExtraArg->dfXSize = nXSize / dfXFactor;
    3794          16 :             psExtraArg->dfYOff = nYOff / dfYFactor;
    3795          16 :             psExtraArg->dfYSize = nYSize / dfYFactor;
    3796             :         }
    3797             :     }
    3798             :     // Only now overwrite the caller's window with the overview window.
    3799        2924 :     nXOff = nOXOff;
    3800        2924 :     nYOff = nOYOff;
    3801        2924 :     nXSize = nOXSize;
    3802        2924 :     nYSize = nOYSize;
    3803             : 
    3804        2924 :     return nBestOverviewLevel;
    3805             : }
    3806             : 
    3807             : /************************************************************************/
    3808             : /*                          OverviewRasterIO()                          */
    3809             : /*                                                                      */
    3810             : /*      Special work function to utilize available overviews to         */
    3811             : /*      more efficiently satisfy downsampled requests.  It will         */
    3812             : /*      return CE_Failure if there are no appropriate overviews         */
    3813             : /*      available but it doesn't emit any error messages.               */
    3814             : /************************************************************************/
    3815             : /* CE_Failure is also returned when GetOverview() yields a null band for
                     :  * the selected level.  Otherwise the result is that of RasterIO() on
                     :  * the overview band.  The window arguments are received by value and a
                     :  * local copy of psExtraArg (sExtraArg) is what gets adjusted and passed
                     :  * down. */
    3816             : //! @cond Doxygen_Suppress
    3817           2 : CPLErr GDALRasterBand::OverviewRasterIO(
    3818             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    3819             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    3820             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    3821             : 
    3822             : {
    3823             :     GDALRasterIOExtraArg sExtraArg;
    3824           2 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    3825             :     // On success this rewrites nXOff/nYOff/nXSize/nYSize (and possibly
                     :     // sExtraArg) in the coordinates of the selected overview.
    3826           2 :     const int nOverview = GDALBandGetBestOverviewLevel2(
    3827             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
    3828           2 :     if (nOverview < 0)
    3829           1 :         return CE_Failure;
    3830             : 
    3831             :     /* -------------------------------------------------------------------- */
    3832             :     /*      Recast the call in terms of the new raster layer.               */
    3833             :     /* -------------------------------------------------------------------- */
    3834           1 :     GDALRasterBand *poOverviewBand = GetOverview(nOverview);
    3835           1 :     if (poOverviewBand == nullptr)
    3836           0 :         return CE_Failure;
    3837             : 
    3838           1 :     return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    3839             :                                     pData, nBufXSize, nBufYSize, eBufType,
    3840           1 :                                     nPixelSpace, nLineSpace, &sExtraArg);
    3841             : }
    3842             : 
    3843             : /************************************************************************/
    3844             : /*                      TryOverviewRasterIO()                           */
    3845             : /************************************************************************/
    3846             : /* Attempts to serve a band-level raster I/O request from an overview.
                     :  *
                     :  * When an overview level is selected and its band can be fetched,
                     :  * *pbTried is set to TRUE and the result of RasterIO() on that overview
                     :  * band is returned.  Otherwise *pbTried is set to FALSE and CE_None is
                     :  * returned without any I/O having been done here.
                     :  *
                     :  * pbTried is dereferenced unconditionally and must not be null.  The
                     :  * window is adjusted on local copies (n*Mod) and on a local copy of
                     :  * psExtraArg (sExtraArg).
                     :  */
    3847      362416 : CPLErr GDALRasterBand::TryOverviewRasterIO(
    3848             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    3849             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    3850             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
    3851             :     int *pbTried)
    3852             : {
    3853      362416 :     int nXOffMod = nXOff;
    3854      362416 :     int nYOffMod = nYOff;
    3855      362416 :     int nXSizeMod = nXSize;
    3856      362416 :     int nYSizeMod = nYSize;
    3857             :     GDALRasterIOExtraArg sExtraArg;
    3858             : 
    3859      362416 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    3860             :     // Returns -1 when no overview is suitable.  Otherwise the *Mod
                     :     // variables and sExtraArg describe the window in that overview.
    3861      362416 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    3862             :         this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
    3863             :         &sExtraArg);
    3864             : 
    3865      362416 :     if (iOvrLevel >= 0)
    3866             :     {
    3867          50 :         GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
    3868          50 :         if (poOverviewBand)
    3869             :         {
    3870          50 :             *pbTried = TRUE;
    3871          50 :             return poOverviewBand->RasterIO(
    3872             :                 eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
    3873             :                 nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    3874          50 :                 &sExtraArg);
    3875             :         }
    3876             :     }
    3877             :     // No usable overview: report that nothing was attempted.
    3878      362366 :     *pbTried = FALSE;
    3879      362366 :     return CE_None;
    3880             : }
    3881             : 
    3882             : /************************************************************************/
    3883             : /*                      TryOverviewRasterIO()                           */
    3884             : /************************************************************************/
    3885             : /* Dataset-level counterpart: attempts to serve a multi-band raster I/O
                     :  * request from the dataset that owns an overview band.
                     :  *
                     :  * The overview level is chosen from papoBands[0] only; panBandMap is
                     :  * not consulted for the selection and is simply forwarded.  If that
                     :  * overview band exists and GetDataset() on it is not null, *pbTried is
                     :  * set to TRUE and the result of RasterIO() on that dataset is returned.
                     :  * Otherwise *pbTried is set to FALSE and CE_None is returned.
                     :  *
                     :  * pbTried is dereferenced unconditionally and must not be null.
                     :  * NOTE(review): papoBands[0] is accessed without a band-count check
                     :  * here; presumably callers guarantee at least one band - confirm.
                     :  */
    3886      158477 : CPLErr GDALDataset::TryOverviewRasterIO(
    3887             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    3888             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    3889             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    3890             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
    3891             :     int *pbTried)
    3892             : {
    3893      158477 :     int nXOffMod = nXOff;
    3894      158477 :     int nYOffMod = nYOff;
    3895      158477 :     int nXSizeMod = nXSize;
    3896      158477 :     int nYSizeMod = nYSize;
    3897             :     GDALRasterIOExtraArg sExtraArg;
    3898      158477 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    3899             :     // The window is adjusted on the local copies only.
    3900      316954 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    3901      158477 :         papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
    3902             :         nBufYSize, &sExtraArg);
    3903             :     // The overview band must exist and be attached to a dataset.
    3904      158518 :     if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
    3905          41 :         papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
    3906             :     {
    3907          41 :         *pbTried = TRUE;
    3908          41 :         return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
    3909             :             eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
    3910             :             nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
    3911          41 :             nLineSpace, nBandSpace, &sExtraArg);
    3912             :     }
    3913             :     else
    3914             :     {
    3915      158436 :         *pbTried = FALSE;
    3916      158436 :         return CE_None;
    3917             :     }
    3918             : }
    3919             : 
    3920             : /************************************************************************/
    3921             : /*                  GDALDatasetGetBestOverviewLevel()                   */
    3922             : /*                                                                      */
    3923             : /* Returns the best overview level to satisfy the query or -1 if none   */
    3924             : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid     */
    3925             : /* overview level                                                       */
    3926             : /************************************************************************/
    3927             : /* The bands listed in panBandMap (nBandCount entries, passed to
                     :  * GetRasterBand()) must all exist and have the same overview count.
                     :  * For every overview index, bands whose overview is non-null must also
                     :  * match the first band's overview in size and block size.  If so, the
                     :  * level is chosen by GDALBandGetBestOverviewLevel2() on the first listed
                     :  * band, with psExtraArg passed through to it.  -1 is returned on any
                     :  * mismatch, on a null band, or when nBandCount is not positive.
                     :  */
    3928           4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
    3929             :                                            int &nYOff, int &nXSize, int &nYSize,
    3930             :                                            int nBufXSize, int nBufYSize,
    3931             :                                            int nBandCount,
    3932             :                                            const int *panBandMap,
    3933             :                                            GDALRasterIOExtraArg *psExtraArg)
    3934             : {
    3935           4 :     int nOverviewCount = 0;
    3936           4 :     GDALRasterBand *poFirstBand = nullptr;
    3937             : 
    3938             :     /* -------------------------------------------------------------------- */
    3939             :     /* Check that all bands have the same number of overviews and           */
    3940             :     /* that they have all the same size and block dimensions                */
    3941             :     /* -------------------------------------------------------------------- */
    3942          12 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    3943             :     {
    3944           8 :         GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
    3945           8 :         if (poBand == nullptr)
    3946           0 :             return -1;
    3947           8 :         if (iBand == 0)
    3948             :         {
    3949           4 :             poFirstBand = poBand;  // reference band for all comparisons
    3950           4 :             nOverviewCount = poBand->GetOverviewCount();
    3951             :         }
    3952           4 :         else if (nOverviewCount != poBand->GetOverviewCount())
    3953             :         {
    3954           0 :             CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    3955             :                              "mismatched overview count, use std method.");
    3956           0 :             return -1;
    3957             :         }
    3958             :         else
    3959             :         {
    3960           4 :             for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    3961             :             {
    3962           0 :                 GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
    3963             :                 GDALRasterBand *poOvrFirstBand =
    3964           0 :                     poFirstBand->GetOverview(iOverview);
    3965           0 :                 if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
    3966           0 :                     continue;  // a null overview is skipped, not a mismatch
    3967             : 
    3968           0 :                 if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
    3969           0 :                     poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
    3970             :                 {
    3971           0 :                     CPLDebug("GDAL",
    3972             :                              "GDALDataset::GetBestOverviewLevel() ... "
    3973             :                              "mismatched overview sizes, use std method.");
    3974           0 :                     return -1;
    3975             :                 }
    3976           0 :                 int nBlockXSizeFirst = 0;
    3977           0 :                 int nBlockYSizeFirst = 0;
    3978           0 :                 poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
    3979             :                                              &nBlockYSizeFirst);
    3980             : 
    3981           0 :                 int nBlockXSizeCurrent = 0;
    3982           0 :                 int nBlockYSizeCurrent = 0;
    3983           0 :                 poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
    3984             :                                         &nBlockYSizeCurrent);
    3985             : 
    3986           0 :                 if (nBlockXSizeFirst != nBlockXSizeCurrent ||
    3987           0 :                     nBlockYSizeFirst != nBlockYSizeCurrent)
    3988             :                 {
    3989           0 :                     CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    3990             :                                      "mismatched block sizes, use std method.");
    3991           0 :                     return -1;
    3992             :                 }
    3993             :             }
    3994             :         }
    3995             :     }
    3996           4 :     if (poFirstBand == nullptr)
    3997           0 :         return -1;  // the loop never ran (nBandCount <= 0)
    3998             :     // All listed bands are consistent: let the first one select the level.
    3999           4 :     return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
    4000             :                                          nYSize, nBufXSize, nBufYSize,
    4001           4 :                                          psExtraArg);
    4002             : }
    4003             : 
    4004             : /************************************************************************/
    4005             : /*                         BlockBasedRasterIO()                         */
    4006             : /*                                                                      */
    4007             : /*      This convenience function implements a dataset level            */
    4008             : /*      RasterIO() interface based on calling down to fetch blocks,     */
    4009             : /*      much like the GDALRasterBand::IRasterIO(), but it handles       */
    4010             : /*      all bands at once, so that a format driver that handles a       */
    4011             : /*      request for different bands of the same block efficiently       */
    4012             : /*      (i.e. without re-reading interleaved data) will benefit.        */
    4013             : /*                                                                      */
    4014             : /*      This method is intended to be called by an overridden           */
    4015             : /*      IRasterIO() method in the driver specific GDALDataset           */
    4016             : /*      derived class.                                                  */
    4017             : /*                                                                      */
    4018             : /*      Default internal implementation of RasterIO() ... utilizes      */
    4019             : /*      the Block access methods to satisfy the request.  This would    */
    4020             : /*      normally only be overridden by formats with overviews.          */
    4021             : /*                                                                      */
    4022             : /*      To keep things relatively simple, this method does not          */
    4023             : /*      currently take advantage of some special cases addressed in     */
    4024             : /*      GDALRasterBand::IRasterIO(), so it is likely best to only       */
    4025             : /*      call it when you know it will help.  That is in cases where     */
    4026             : /*      data is at 1:1 to the buffer, and you know the driver is        */
    4027             : /*      implementing interleaved IO efficiently on a block by block     */
    4028             : /*      basis. Overviews will be used when possible.                    */
    4029             : /************************************************************************/
    4030             : 
    4031       63910 : CPLErr GDALDataset::BlockBasedRasterIO(
    4032             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    4033             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4034             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    4035             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    4036             : 
    4037             : {
    4038       63910 :     CPLAssert(nullptr != pData);
    4039             : 
    4040       63910 :     GByte **papabySrcBlock = nullptr;
    4041       63910 :     GDALRasterBlock *poBlock = nullptr;
    4042       63910 :     GDALRasterBlock **papoBlocks = nullptr;
    4043       63910 :     int nLBlockX = -1;
    4044       63910 :     int nLBlockY = -1;
    4045             :     int iBufYOff;
    4046             :     int iBufXOff;
    4047       63910 :     int nBlockXSize = 1;
    4048       63910 :     int nBlockYSize = 1;
    4049       63910 :     CPLErr eErr = CE_None;
    4050       63910 :     GDALDataType eDataType = GDT_Byte;
    4051             : 
    4052       63910 :     const bool bUseIntegerRequestCoords =
    4053       63940 :         (!psExtraArg->bFloatingPointWindowValidity ||
    4054          30 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
    4055          28 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
    4056             : 
    4057             :     /* -------------------------------------------------------------------- */
    4058             :     /*      Ensure that all bands share a common block size and data type.  */
    4059             :     /* -------------------------------------------------------------------- */
    4060      303005 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4061             :     {
    4062      239093 :         GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4063             : 
    4064      239096 :         if (iBand == 0)
    4065             :         {
    4066       63911 :             poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4067       63910 :             eDataType = poBand->GetRasterDataType();
    4068             :         }
    4069             :         else
    4070             :         {
    4071      175185 :             int nThisBlockXSize = 0;
    4072      175185 :             int nThisBlockYSize = 0;
    4073      175185 :             poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
    4074      175185 :             if (nThisBlockXSize != nBlockXSize ||
    4075      175183 :                 nThisBlockYSize != nBlockYSize)
    4076             :             {
    4077           2 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4078             :                                  "mismatched block sizes, use std method.");
    4079           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4080             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4081             :                                          nBandCount, panBandMap, nPixelSpace,
    4082           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4083             :             }
    4084             : 
    4085      175183 :             if (eDataType != poBand->GetRasterDataType() &&
    4086           0 :                 (nXSize != nBufXSize || nYSize != nBufYSize))
    4087             :             {
    4088           2 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4089             :                                  "mismatched band data types, use std method.");
    4090           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4091             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4092             :                                          nBandCount, panBandMap, nPixelSpace,
    4093           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4094             :             }
    4095             :         }
    4096             :     }
    4097             : 
    4098             :     /* ==================================================================== */
    4099             :     /*      In this special case at full resolution we step through in      */
    4100             :     /*      blocks, turning the request over to the per-band                */
    4101             :     /*      IRasterIO(), but ensuring that all bands of one block are       */
    4102             :     /*      called before proceeding to the next.                           */
    4103             :     /* ==================================================================== */
    4104             : 
    4105       63912 :     if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
    4106             :     {
    4107             :         GDALRasterIOExtraArg sDummyExtraArg;
    4108       63908 :         INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
    4109             : 
    4110       63908 :         int nChunkYSize = 0;
    4111       63908 :         int nChunkXSize = 0;
    4112             : 
    4113      210044 :         for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
    4114             :         {
    4115      147151 :             const int nChunkYOff = iBufYOff + nYOff;
    4116      147151 :             nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
    4117      147151 :             if (nChunkYOff + nChunkYSize > nYOff + nYSize)
    4118       59100 :                 nChunkYSize = (nYOff + nYSize) - nChunkYOff;
    4119             : 
    4120      816576 :             for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
    4121             :             {
    4122      670439 :                 const int nChunkXOff = iBufXOff + nXOff;
    4123      670439 :                 nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
    4124      670439 :                 if (nChunkXOff + nChunkXSize > nXOff + nXSize)
    4125       70147 :                     nChunkXSize = (nXOff + nXSize) - nChunkXOff;
    4126             : 
    4127      670439 :                 GByte *pabyChunkData =
    4128      670439 :                     static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
    4129      670439 :                     static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
    4130             : 
    4131     3265400 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4132             :                 {
    4133     2595980 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4134             : 
    4135     5191960 :                     eErr = poBand->IRasterIO(
    4136             :                         eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
    4137             :                         nChunkYSize,
    4138     2595980 :                         pabyChunkData +
    4139     2595980 :                             static_cast<GPtrDiff_t>(iBand) * nBandSpace,
    4140             :                         nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
    4141     2595980 :                         nLineSpace, &sDummyExtraArg);
    4142     2595980 :                     if (eErr != CE_None)
    4143        1014 :                         return eErr;
    4144             :                 }
    4145             :             }
    4146             : 
    4147      164927 :             if (psExtraArg->pfnProgress != nullptr &&
    4148       18790 :                 !psExtraArg->pfnProgress(
    4149      164927 :                     1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
    4150             :                         nBufYSize,
    4151             :                     "", psExtraArg->pProgressData))
    4152             :             {
    4153           1 :                 return CE_Failure;
    4154             :             }
    4155             :         }
    4156             : 
    4157       62893 :         return CE_None;
    4158             :     }
    4159             : 
    4160             :     /* Below code is not compatible with that case. It would need a complete */
    4161             :     /* separate code like done in GDALRasterBand::IRasterIO. */
    4162           4 :     if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
    4163             :     {
    4164           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4165             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4166             :                                  panBandMap, nPixelSpace, nLineSpace,
    4167           0 :                                  nBandSpace, psExtraArg);
    4168             :     }
    4169             : 
    4170             :     /* We could have a smarter implementation, but that will do for now */
    4171           4 :     if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    4172           0 :         (nBufXSize != nXSize || nBufYSize != nYSize))
    4173             :     {
    4174           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4175             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4176             :                                  panBandMap, nPixelSpace, nLineSpace,
    4177           0 :                                  nBandSpace, psExtraArg);
    4178             :     }
    4179             : 
    4180             :     /* ==================================================================== */
    4181             :     /*      Loop reading required source blocks to satisfy output           */
    4182             :     /*      request.  This is the most general implementation.              */
    4183             :     /* ==================================================================== */
    4184             : 
    4185           4 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
    4186             : 
    4187             :     papabySrcBlock =
    4188           4 :         static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
    4189             :     papoBlocks =
    4190           4 :         static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
    4191             : 
    4192             :     /* -------------------------------------------------------------------- */
    4193             :     /*      Select an overview level if appropriate.                        */
    4194             :     /* -------------------------------------------------------------------- */
    4195             : 
    4196             :     GDALRasterIOExtraArg sExtraArg;
    4197           4 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4198           4 :     const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
    4199             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
    4200             :         panBandMap, &sExtraArg);
    4201           4 :     if (nOverviewLevel >= 0)
    4202             :     {
    4203           2 :         GetRasterBand(panBandMap[0])
    4204           2 :             ->GetOverview(nOverviewLevel)
    4205           2 :             ->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4206             :     }
    4207             : 
    4208           4 :     double dfXOff = nXOff;
    4209           4 :     double dfYOff = nYOff;
    4210           4 :     double dfXSize = nXSize;
    4211           4 :     double dfYSize = nYSize;
    4212           4 :     if (sExtraArg.bFloatingPointWindowValidity)
    4213             :     {
    4214           2 :         dfXOff = sExtraArg.dfXOff;
    4215           2 :         dfYOff = sExtraArg.dfYOff;
    4216           2 :         dfXSize = sExtraArg.dfXSize;
    4217           2 :         dfYSize = sExtraArg.dfYSize;
    4218             :     }
    4219             : 
    4220             :     /* -------------------------------------------------------------------- */
    4221             :     /*      Compute stepping increment.                                     */
    4222             :     /* -------------------------------------------------------------------- */
    4223           4 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
    4224           4 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
    4225             : 
    4226           4 :     constexpr double EPS = 1e-10;
    4227             :     /* -------------------------------------------------------------------- */
    4228             :     /*      Loop over buffer computing source locations.                    */
    4229             :     /* -------------------------------------------------------------------- */
    4230          36 :     for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
    4231             :     {
    4232             :         GPtrDiff_t iSrcOffset;
    4233             : 
    4234             :         // Add small epsilon to avoid some numeric precision issues.
    4235          32 :         const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
    4236          32 :         const int iSrcY = static_cast<int>(std::min(
    4237          32 :             std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
    4238             : 
    4239          32 :         GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
    4240             :                                 static_cast<GPtrDiff_t>(nLineSpace);
    4241             : 
    4242         302 :         for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
    4243             :         {
    4244         270 :             const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
    4245         270 :             const int iSrcX = static_cast<int>(std::min(
    4246         270 :                 std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
    4247             : 
    4248             :             // FIXME: this code likely doesn't work if the dirty block gets
    4249             :             // flushed to disk before being completely written. In the meantime,
    4250             :             // bJustInitialize should probably be set to FALSE even if it is not
    4251             :             // ideal performance wise, and for lossy compression
    4252             : 
    4253             :             /* --------------------------------------------------------------------
    4254             :              */
    4255             :             /*      Ensure we have the appropriate block loaded. */
    4256             :             /* --------------------------------------------------------------------
    4257             :              */
    4258         270 :             if (iSrcX < nLBlockX * nBlockXSize ||
    4259         270 :                 iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
    4260         266 :                 iSrcY < nLBlockY * nBlockYSize ||
    4261         266 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
    4262             :             {
    4263           4 :                 nLBlockX = iSrcX / nBlockXSize;
    4264           4 :                 nLBlockY = iSrcY / nBlockYSize;
    4265             : 
    4266           4 :                 const bool bJustInitialize =
    4267           0 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
    4268           0 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
    4269           4 :                     nXOff <= nLBlockX * nBlockXSize &&
    4270           0 :                     nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
    4271             :                 /*bool bMemZeroBuffer = FALSE;
    4272             :                 if( eRWFlag == GF_Write && !bJustInitialize &&
    4273             :                     nXOff <= nLBlockX * nBlockXSize &&
    4274             :                     nYOff <= nLBlockY * nBlockYSize &&
    4275             :                     (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
    4276             :                      (nXOff + nXSize == GetRasterXSize() &&
    4277             :                      (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
    4278             :                     (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
    4279             :                      (nYOff + nYSize == GetRasterYSize() &&
    4280             :                      (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
    4281             :                 {
    4282             :                     bJustInitialize = TRUE;
    4283             :                     bMemZeroBuffer = TRUE;
    4284             :                 }*/
    4285          12 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4286             :                 {
    4287           8 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4288           8 :                     if (nOverviewLevel >= 0)
    4289           2 :                         poBand = poBand->GetOverview(nOverviewLevel);
    4290          16 :                     poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
    4291           8 :                                                         bJustInitialize);
    4292           8 :                     if (poBlock == nullptr)
    4293             :                     {
    4294           0 :                         eErr = CE_Failure;
    4295           0 :                         goto CleanupAndReturn;
    4296             :                     }
    4297             : 
    4298           8 :                     if (eRWFlag == GF_Write)
    4299           0 :                         poBlock->MarkDirty();
    4300             : 
    4301           8 :                     if (papoBlocks[iBand] != nullptr)
    4302           0 :                         papoBlocks[iBand]->DropLock();
    4303             : 
    4304           8 :                     papoBlocks[iBand] = poBlock;
    4305             : 
    4306           8 :                     papabySrcBlock[iBand] =
    4307           8 :                         static_cast<GByte *>(poBlock->GetDataRef());
    4308             :                     /*if( bMemZeroBuffer )
    4309             :                     {
    4310             :                         memset(papabySrcBlock[iBand], 0,
    4311             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
    4312             :                     * nBlockYSize);
    4313             :                     }*/
    4314             :                 }
    4315             :             }
    4316             : 
    4317             :             /* --------------------------------------------------------------------
    4318             :              */
    4319             :             /*      Copy over this pixel of data. */
    4320             :             /* --------------------------------------------------------------------
    4321             :              */
    4322         270 :             iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
    4323         270 :                           static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
    4324         270 :                           (static_cast<GPtrDiff_t>(iSrcY) -
    4325         270 :                            static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
    4326         270 :                               nBlockXSize) *
    4327         270 :                          nBandDataSize;
    4328             : 
    4329         980 :             for (int iBand = 0; iBand < nBandCount; iBand++)
    4330             :             {
    4331         710 :                 GByte *pabySrcBlock = papabySrcBlock[iBand];
    4332         710 :                 GPtrDiff_t iBandBufOffset =
    4333         710 :                     iBufOffset + static_cast<GPtrDiff_t>(iBand) *
    4334             :                                      static_cast<GPtrDiff_t>(nBandSpace);
    4335             : 
    4336         710 :                 if (eDataType == eBufType)
    4337             :                 {
    4338         710 :                     if (eRWFlag == GF_Read)
    4339         710 :                         memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
    4340         710 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
    4341             :                     else
    4342           0 :                         memcpy(pabySrcBlock + iSrcOffset,
    4343             :                                static_cast<const GByte *>(pData) +
    4344           0 :                                    iBandBufOffset,
    4345             :                                nBandDataSize);
    4346             :                 }
    4347             :                 else
    4348             :                 {
    4349             :                     /* type to type conversion ... ouch, this is expensive way
    4350             :                        of handling single words */
    4351             : 
    4352           0 :                     if (eRWFlag == GF_Read)
    4353           0 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
    4354             :                                         static_cast<GByte *>(pData) +
    4355           0 :                                             iBandBufOffset,
    4356             :                                         eBufType, 0, 1);
    4357             :                     else
    4358           0 :                         GDALCopyWords64(static_cast<const GByte *>(pData) +
    4359           0 :                                             iBandBufOffset,
    4360           0 :                                         eBufType, 0, pabySrcBlock + iSrcOffset,
    4361             :                                         eDataType, 0, 1);
    4362             :                 }
    4363             :             }
    4364             : 
    4365         270 :             iBufOffset += static_cast<int>(nPixelSpace);
    4366             :         }
    4367             :     }
    4368             : 
    4369             :     /* -------------------------------------------------------------------- */
    4370             :     /*      CleanupAndReturn.                                               */
    4371             :     /* -------------------------------------------------------------------- */
    4372           4 : CleanupAndReturn:
    4373           4 :     CPLFree(papabySrcBlock);
    4374           4 :     if (papoBlocks != nullptr)
    4375             :     {
    4376          12 :         for (int iBand = 0; iBand < nBandCount; iBand++)
    4377             :         {
    4378           8 :             if (papoBlocks[iBand] != nullptr)
    4379           8 :                 papoBlocks[iBand]->DropLock();
    4380             :         }
    4381           4 :         CPLFree(papoBlocks);
    4382             :     }
    4383             : 
    4384           4 :     return eErr;
    4385             : }
    4386             : 
    4387             : //! @endcond
    4388             : 
    4389             : /************************************************************************/
    4390             : /*                  GDALCopyWholeRasterGetSwathSize()                   */
    4391             : /************************************************************************/
    4392             : 
    4393        3135 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
    4394             :                                             GDALRasterBand *poDstPrototypeBand,
    4395             :                                             int nBandCount,
    4396             :                                             int bDstIsCompressed,
    4397             :                                             int bInterleave, int *pnSwathCols,
    4398             :                                             int *pnSwathLines)
    4399             : {
    4400        3135 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    4401        3135 :     int nSrcBlockXSize = 0;
    4402        3135 :     int nSrcBlockYSize = 0;
    4403        3135 :     int nBlockXSize = 0;
    4404        3135 :     int nBlockYSize = 0;
    4405             : 
    4406        3135 :     int nXSize = poSrcPrototypeBand->GetXSize();
    4407        3135 :     int nYSize = poSrcPrototypeBand->GetYSize();
    4408             : 
    4409        3135 :     poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
    4410        3135 :     poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4411             : 
    4412        3135 :     const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
    4413        3135 :     const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
    4414             : 
    4415        3135 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    4416        3135 :     if (bInterleave)
    4417         548 :         nPixelSize *= nBandCount;
    4418             : 
    4419             :     // aim for one row of blocks.  Do not settle for less.
    4420        3135 :     int nSwathCols = nXSize;
    4421        3135 :     int nSwathLines = nMaxBlockYSize;
    4422             : 
    4423             :     const char *pszSrcCompression =
    4424        3135 :         poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    4425        3135 :     if (pszSrcCompression == nullptr)
    4426             :     {
    4427        3109 :         auto poSrcDS = poSrcPrototypeBand->GetDataset();
    4428        3109 :         if (poSrcDS)
    4429             :             pszSrcCompression =
    4430        3103 :                 poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    4431             :     }
    4432             : 
    4433             :     /* -------------------------------------------------------------------- */
    4434             :     /*      What will our swath size be?                                    */
    4435             :     /* -------------------------------------------------------------------- */
    4436             :     // When writing interleaved data in a compressed format, we want to be sure
    4437             :     // that each block will only be written once, so the swath size must not be
    4438             :     // greater than the block cache.
    4439        3135 :     const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
    4440             :     int nTargetSwathSize;
    4441        3135 :     if (pszSwathSize != nullptr)
    4442           0 :         nTargetSwathSize = static_cast<int>(
    4443           0 :             std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
    4444             :     else
    4445             :     {
    4446             :         // As a default, take one 1/4 of the cache size.
    4447        3135 :         nTargetSwathSize = static_cast<int>(
    4448        3135 :             std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
    4449             : 
    4450             :         // but if the minimum ideal swath buf size is less, then go for it to
    4451             :         // avoid unnecessarily abusing RAM usage.
    4452             :         // but try to use 10 MB at least.
    4453        3135 :         GIntBig nIdealSwathBufSize =
    4454        3135 :             static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
    4455        3135 :         int nMinTargetSwathSize = 10 * 1000 * 1000;
    4456             : 
    4457        3135 :         if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
    4458        3135 :              GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
    4459             :         {
    4460           2 :             nMinTargetSwathSize = nTargetSwathSize;
    4461             :         }
    4462             : 
    4463        3135 :         if (nIdealSwathBufSize < nTargetSwathSize &&
    4464        3125 :             nIdealSwathBufSize < nMinTargetSwathSize)
    4465             :         {
    4466        3122 :             nIdealSwathBufSize = nMinTargetSwathSize;
    4467             :         }
    4468             : 
    4469        3135 :         if (pszSrcCompression != nullptr &&
    4470         180 :             EQUAL(pszSrcCompression, "JPEG2000") &&
    4471           0 :             (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
    4472           0 :                                    (nSrcBlockYSize % nBlockYSize) == 0)))
    4473             :         {
    4474           2 :             nIdealSwathBufSize =
    4475           4 :                 std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
    4476           2 :                                                  nSrcBlockYSize * nPixelSize);
    4477             :         }
    4478        3135 :         if (nTargetSwathSize > nIdealSwathBufSize)
    4479        3121 :             nTargetSwathSize = static_cast<int>(
    4480        3121 :                 std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
    4481             :     }
    4482             : 
    4483        3135 :     if (nTargetSwathSize < 1000000)
    4484           8 :         nTargetSwathSize = 1000000;
    4485             : 
    4486             :     /* But let's check that the swath fits in the block cache. */
    4487        3352 :     if (bDstIsCompressed && bInterleave &&
    4488         217 :         nTargetSwathSize > GDALGetCacheMax64())
    4489             :     {
    4490           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    4491             :                  "When translating into a compressed interleave format, "
    4492             :                  "the block cache size (" CPL_FRMT_GIB ") "
    4493             :                  "should be at least the size of the swath (%d) "
    4494             :                  "(GDAL_SWATH_SIZE config. option)",
    4495             :                  GDALGetCacheMax64(), nTargetSwathSize);
    4496             :     }
    4497             : 
    4498             : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
    4499             : #define ROUND_TO(x, y) (((x) / (y)) * (y))
    4500             : 
    4501             :     // If both input and output datasets are tiled, and the tile dimensions
    4502             :     // are "compatible", try to stick to a swath dimension that is a multiple
    4503             :     // of input and output block dimensions.
    4504        3135 :     if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
    4505          37 :         IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
    4506          37 :         IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
    4507          37 :         IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    4508          37 :         IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    4509             :     {
    4510          37 :         if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
    4511          37 :                 nPixelSize <=
    4512          37 :             static_cast<GIntBig>(nTargetSwathSize))
    4513             :         {
    4514          37 :             nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
    4515          37 :             nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
    4516          37 :             if (nSwathCols == 0)
    4517           0 :                 nSwathCols = nMaxBlockXSize;
    4518          37 :             if (nSwathCols > nXSize)
    4519          35 :                 nSwathCols = nXSize;
    4520          37 :             nSwathLines = nMaxBlockYSize;
    4521             : 
    4522          37 :             if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
    4523          37 :                 static_cast<GIntBig>(nTargetSwathSize))
    4524             :             {
    4525           0 :                 nSwathCols = nXSize;
    4526           0 :                 nSwathLines = nBlockYSize;
    4527             :             }
    4528             :         }
    4529             :     }
    4530             : 
    4531        3135 :     const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
    4532        3135 :     const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
    4533        3135 :     if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
    4534             :     {
    4535           1 :         nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
    4536           1 :         if (nSwathLines == 0)
    4537           1 :             nSwathLines = 1;
    4538             : 
    4539           1 :         CPLDebug(
    4540             :             "GDAL",
    4541             :             "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
    4542             :             "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
    4543             :             "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
    4544           1 :             nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
    4545             :     }
    4546             :     // If we are processing single scans, try to handle several at once.
    4547             :     // If we are handling swaths already, only grow the swath if a row
    4548             :     // of blocks is substantially less than our target buffer size.
    4549        3134 :     else if (nSwathLines == 1 ||
    4550        2599 :              nMemoryPerCol * nSwathLines <
    4551        2599 :                  static_cast<GIntBig>(nTargetSwathSize) / 10)
    4552             :     {
    4553        3106 :         nSwathLines = std::min(
    4554             :             nYSize,
    4555        3106 :             std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
    4556             : 
    4557             :         /* If possible try to align to source and target block height */
    4558        3106 :         if ((nSwathLines % nMaxBlockYSize) != 0 &&
    4559         251 :             nSwathLines > nMaxBlockYSize &&
    4560         251 :             IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    4561         222 :             IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    4562         202 :             nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
    4563             :     }
    4564             : 
    4565        3135 :     if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
    4566           0 :         (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
    4567           0 :                                IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
    4568             :     {
    4569             :         // Typical use case: converting from Pleiades imagery that is 2048x2048 tiled.
    4570           2 :         if (nSwathLines < nSrcBlockYSize)
    4571             :         {
    4572           0 :             nSwathLines = nSrcBlockYSize;
    4573             : 
    4574             :             // Number of pixels that can be read/write simultaneously.
    4575           0 :             nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
    4576           0 :             nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
    4577           0 :             if (nSwathCols == 0)
    4578           0 :                 nSwathCols = nSrcBlockXSize;
    4579           0 :             if (nSwathCols > nXSize)
    4580           0 :                 nSwathCols = nXSize;
    4581             : 
    4582           0 :             CPLDebug(
    4583             :                 "GDAL",
    4584             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    4585             :                 "too high block, "
    4586             :                 "use partial width at one time");
    4587             :         }
    4588           2 :         else if ((nSwathLines % nSrcBlockYSize) != 0)
    4589             :         {
    4590             :             /* Round on a multiple of nSrcBlockYSize */
    4591           0 :             nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
    4592           0 :             CPLDebug(
    4593             :                 "GDAL",
    4594             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    4595             :                 "round nSwathLines to block height : %d",
    4596             :                 nSwathLines);
    4597             :         }
    4598             :     }
    4599        3133 :     else if (bDstIsCompressed)
    4600             :     {
    4601         407 :         if (nSwathLines < nBlockYSize)
    4602             :         {
    4603         145 :             nSwathLines = nBlockYSize;
    4604             : 
    4605             :             // Number of pixels that can be read/write simultaneously.
    4606         145 :             nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
    4607         145 :             nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
    4608         145 :             if (nSwathCols == 0)
    4609           0 :                 nSwathCols = nBlockXSize;
    4610         145 :             if (nSwathCols > nXSize)
    4611         145 :                 nSwathCols = nXSize;
    4612             : 
    4613         145 :             CPLDebug(
    4614             :                 "GDAL",
    4615             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    4616             :                 "too high block, "
    4617             :                 "use partial width at one time");
    4618             :         }
    4619         262 :         else if ((nSwathLines % nBlockYSize) != 0)
    4620             :         {
    4621             :             // Round on a multiple of nBlockYSize.
    4622           9 :             nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
    4623           9 :             CPLDebug(
    4624             :                 "GDAL",
    4625             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    4626             :                 "round nSwathLines to block height : %d",
    4627             :                 nSwathLines);
    4628             :         }
    4629             :     }
    4630             : 
    4631        3135 :     *pnSwathCols = nSwathCols;
    4632        3135 :     *pnSwathLines = nSwathLines;
    4633        3135 : }
    4634             : 
    4635             : /************************************************************************/
    4636             : /*                     GDALDatasetCopyWholeRaster()                     */
    4637             : /************************************************************************/
    4638             : 
    4639             : /**
    4640             :  * \brief Copy all dataset raster data.
    4641             :  *
    4642             :  * This function copies the complete raster contents of one dataset to
    4643             :  * another similarly configured dataset.  The source and destination
    4644             :  * dataset must have the same number of bands, and the same width
    4645             :  * and height.  The bands do not have to have the same data type.
    4646             :  *
    4647             :  * This function is primarily intended to support implementation of
    4648             :  * driver specific CreateCopy() functions.  It implements efficient copying,
    4649             :  * in particular "chunking" the copy in substantial blocks and, if appropriate,
    4650             :  * performing the transfer in a pixel interleaved fashion.
    4651             :  *
    4652             :  * Currently the only papszOptions values supported are:
    4653             :  * <ul>
    4654             :  * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
    4655             :  * write access pattern (this does not modify the layout of the destination
    4656             :  * data)</li> <li>"COMPRESSED=YES" to force alignment on target dataset block
    4657             :  * sizes to achieve best compression.</li> <li>"SKIP_HOLES=YES" to skip chunks
    4658             :  * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
    4659             :  * (GDAL &gt;= 2.2)</li>
    4660             :  * </ul>
    4661             :  * More options may be supported in the future.
    4662             :  *
    4663             :  * @param hSrcDS the source dataset
    4664             :  * @param hDstDS the destination dataset
    4665             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    4666             :  * @param pfnProgress progress reporting function, or NULL (GDALDummyProgress is then used).
    4667             :  * @param pProgressData callback data for progress function.
    4668             :  *
    4669             :  * @return CE_None on success (or if there are no bands), CE_Failure otherwise.
    4670             :  */
    4671             : 
    4672        3108 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
    4673             :                                               GDALDatasetH hDstDS,
    4674             :                                               CSLConstList papszOptions,
    4675             :                                               GDALProgressFunc pfnProgress,
    4676             :                                               void *pProgressData)
    4677             : 
    4678             : {
    4679        3108 :     VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    4680        3108 :     VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    4681             : 
    4682        3108 :     GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
    4683        3108 :     GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
    4684             :     // A null progress callback is replaced by GDALDummyProgress.
    4685        3108 :     if (pfnProgress == nullptr)
    4686           0 :         pfnProgress = GDALDummyProgress;
    4687             : 
    4688             :     /* -------------------------------------------------------------------- */
    4689             :     /*      Confirm the datasets match in size and band counts.             */
    4690             :     /* -------------------------------------------------------------------- */
    4691        3108 :     const int nXSize = poDstDS->GetRasterXSize();
    4692        3108 :     const int nYSize = poDstDS->GetRasterYSize();
    4693        3108 :     const int nBandCount = poDstDS->GetRasterCount();
    4694             : 
    4695        3108 :     if (poSrcDS->GetRasterXSize() != nXSize ||
    4696        6216 :         poSrcDS->GetRasterYSize() != nYSize ||
    4697        3108 :         poSrcDS->GetRasterCount() != nBandCount)
    4698             :     {
    4699           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    4700             :                  "Input and output dataset sizes or band counts do not\n"
    4701             :                  "match in GDALDatasetCopyWholeRaster()");
    4702           0 :         return CE_Failure;
    4703             :     }
    4704             : 
    4705             :     /* -------------------------------------------------------------------- */
    4706             :     /*      Report preliminary (0) progress.                                */
    4707             :     /* -------------------------------------------------------------------- */
    4708        3108 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    4709             :     {
    4710           1 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    4711             :                  "User terminated CreateCopy()");
    4712           1 :         return CE_Failure;
    4713             :     }
    4714             : 
    4715             :     /* -------------------------------------------------------------------- */
    4716             :     /*      Get our prototype band, and assume the others are similarly     */
    4717             :     /*      configured.                                                     */
    4718             :     /* -------------------------------------------------------------------- */
    4719        3107 :     if (nBandCount == 0)
    4720           0 :         return CE_None;
    4721             :     // Band 1 is requested on both sides: band counts match and are non-zero.
    4722        3107 :     GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
    4723        3107 :     GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
    4724        3107 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    4725             : 
    4726             :     /* -------------------------------------------------------------------- */
    4727             :     /*      Do we want to try and do the operation in a pixel               */
    4728             :     /*      interleaved fashion?                                            */
    4729             :     /* -------------------------------------------------------------------- */
    4730        3107 :     bool bInterleave = false;
    4731             :     const char *pszInterleave =
    4732        3107 :         poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    4733        3107 :     if (pszInterleave != nullptr &&
    4734        2756 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    4735         184 :         bInterleave = true;
    4736             :     // The destination advertising PIXEL or LINE interleaving also enables it.
    4737        3107 :     pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    4738        3107 :     if (pszInterleave != nullptr &&
    4739        2652 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    4740         495 :         bInterleave = true;
    4741             :     // An explicit INTERLEAVE option takes precedence over dataset metadata.
    4742        3107 :     pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
    4743        3107 :     if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
    4744           5 :         bInterleave = true;
    4745        3102 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
    4746          13 :         bInterleave = false;
    4747             :     // ATTRIBUTES is specific to the TileDB driver.
    4748        3089 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
    4749           4 :         bInterleave = true;
    4750        3085 :     else if (pszInterleave != nullptr)
    4751             :     {
    4752           0 :         CPLError(CE_Warning, CPLE_NotSupported,
    4753             :                  "Unsupported value for option INTERLEAVE");
    4754             :     }
    4755             : 
    4756             :     // If the destination is compressed, we must try to write blocks just once,
    4757             :     // to save disk space (GTiff case for example), and to avoid data loss
    4758             :     // (JPEG compression for example).
    4759        3107 :     bool bDstIsCompressed = false;
    4760             :     const char *pszDstCompressed =
    4761        3107 :         CSLFetchNameValue(papszOptions, "COMPRESSED");
    4762        3107 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    4763         382 :         bDstIsCompressed = true;
    4764             : 
    4765             :     /* -------------------------------------------------------------------- */
    4766             :     /*      What will our swath size be?                                    */
    4767             :     /* -------------------------------------------------------------------- */
    4768             : 
    4769        3107 :     int nSwathCols = 0;
    4770        3107 :     int nSwathLines = 0;
    4771        3107 :     GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
    4772             :                                     nBandCount, bDstIsCompressed, bInterleave,
    4773             :                                     &nSwathCols, &nSwathLines);
    4774             :     // When interleaved, the buffer must hold every band of a swath at once.
    4775        3107 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    4776        3107 :     if (bInterleave)
    4777         548 :         nPixelSize *= nBandCount;
    4778             :     // A single buffer is allocated once and reused for every chunk.
    4779        3107 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    4780        3107 :     if (pSwathBuf == nullptr)
    4781             :     {
    4782           0 :         return CE_Failure;
    4783             :     }
    4784             : 
    4785        3107 :     CPLDebug("GDAL",
    4786             :              "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
    4787             :              nSwathCols, nSwathLines, static_cast<int>(bInterleave));
    4788             : 
    4789             :     // Advise the source raster that we are going to read it completely
    4790             :     // Note: this might already have been done by GDALCreateCopy() in the
    4791             :     // likely case this function is indirectly called by it
    4792        3107 :     poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
    4793        3107 :                         nullptr, nullptr);
    4794             : 
    4795             :     /* ==================================================================== */
    4796             :     /*      Band oriented (uninterleaved) case.                             */
    4797             :     /* ==================================================================== */
    4798        3107 :     CPLErr eErr = CE_None;
    4799             :     const bool bCheckHoles =
    4800        3107 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    4801             : 
    4802        3107 :     if (!bInterleave)
    4803             :     {
    4804             :         GDALRasterIOExtraArg sExtraArg;
    4805        2559 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    4806        2559 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    4807             :         // Total chunk count, used as the denominator for progress reporting.
    4808        7677 :         const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
    4809        2559 :                                      DIV_ROUND_UP(nYSize, nSwathLines) *
    4810        2559 :                                      DIV_ROUND_UP(nXSize, nSwathCols);
    4811        2559 :         GIntBig nBlocksDone = 0;
    4812             : 
    4813        7501 :         for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
    4814             :         {
    4815        4942 :             int nBand = iBand + 1;
    4816             : 
    4817       10199 :             for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    4818             :             {
    4819        5257 :                 int nThisLines = nSwathLines;
    4820             :                 // Clip the last swath to the raster height.
    4821        5257 :                 if (iY + nThisLines > nYSize)
    4822         375 :                     nThisLines = nYSize - iY;
    4823             : 
    4824       10514 :                 for (int iX = 0; iX < nXSize && eErr == CE_None;
    4825        5257 :                      iX += nSwathCols)
    4826             :                 {
    4827        5257 :                     int nThisCols = nSwathCols;
    4828             :                     // Clip the last swath to the raster width.
    4829        5257 :                     if (iX + nThisCols > nXSize)
    4830           0 :                         nThisCols = nXSize - iX;
    4831             :                     // With SKIP_HOLES, ask the source whether this chunk has data.
    4832        5257 :                     int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    4833        5257 :                     if (bCheckHoles)
    4834             :                     {
    4835             :                         nStatus = poSrcDS->GetRasterBand(nBand)
    4836        3632 :                                       ->GetDataCoverageStatus(
    4837             :                                           iX, iY, nThisCols, nThisLines,
    4838             :                                           GDAL_DATA_COVERAGE_STATUS_DATA);
    4839             :                     }
    4840        5257 :                     if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    4841             :                     {
    4842        5253 :                         sExtraArg.pfnProgress = GDALScaledProgress;
    4843       10506 :                         sExtraArg.pProgressData = GDALCreateScaledProgress(
    4844        5253 :                             nBlocksDone / static_cast<double>(nTotalBlocks),
    4845        5253 :                             (nBlocksDone + 0.5) /
    4846        5253 :                                 static_cast<double>(nTotalBlocks),
    4847             :                             pfnProgress, pProgressData);
    4848        5253 :                         if (sExtraArg.pProgressData == nullptr)
    4849        1603 :                             sExtraArg.pfnProgress = nullptr;
    4850             :                         // Read into the swath buffer, then write it out unchanged.
    4851        5253 :                         eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    4852             :                                                  nThisLines, pSwathBuf,
    4853             :                                                  nThisCols, nThisLines, eDT, 1,
    4854             :                                                  &nBand, 0, 0, 0, &sExtraArg);
    4855             : 
    4856        5253 :                         GDALDestroyScaledProgress(sExtraArg.pProgressData);
    4857             : 
    4858        5253 :                         if (eErr == CE_None)
    4859        5246 :                             eErr = poDstDS->RasterIO(
    4860             :                                 GF_Write, iX, iY, nThisCols, nThisLines,
    4861             :                                 pSwathBuf, nThisCols, nThisLines, eDT, 1,
    4862             :                                 &nBand, 0, 0, 0, nullptr);
    4863             :                     }
    4864             :                     // Chunks skipped as holes still count towards progress.
    4865        5257 :                     nBlocksDone++;
    4866       10472 :                     if (eErr == CE_None &&
    4867        5215 :                         !pfnProgress(nBlocksDone /
    4868        5215 :                                          static_cast<double>(nTotalBlocks),
    4869             :                                      nullptr, pProgressData))
    4870             :                     {
    4871           2 :                         eErr = CE_Failure;
    4872           2 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    4873             :                                  "User terminated CreateCopy()");
    4874             :                     }
    4875             :                 }
    4876             :             }
    4877             :         }
    4878             :     }
    4879             : 
    4880             :     /* ==================================================================== */
    4881             :     /*      Pixel interleaved case.                                         */
    4882             :     /* ==================================================================== */
    4883             :     else /* if( bInterleave ) */
    4884             :     {
    4885             :         GDALRasterIOExtraArg sExtraArg;
    4886         548 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    4887         548 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    4888             :         // Total chunk count, used as the denominator for progress reporting.
    4889         548 :         const GIntBig nTotalBlocks =
    4890         548 :             static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
    4891         548 :             DIV_ROUND_UP(nXSize, nSwathCols);
    4892         548 :         GIntBig nBlocksDone = 0;
    4893             : 
    4894        1311 :         for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    4895             :         {
    4896         763 :             int nThisLines = nSwathLines;
    4897             :             // Clip the last swath to the raster height.
    4898         763 :             if (iY + nThisLines > nYSize)
    4899         190 :                 nThisLines = nYSize - iY;
    4900             : 
    4901        1531 :             for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    4902             :             {
    4903         768 :                 int nThisCols = nSwathCols;
    4904             :                 // Clip the last swath to the raster width.
    4905         768 :                 if (iX + nThisCols > nXSize)
    4906           3 :                     nThisCols = nXSize - iX;
    4907             :                 // With SKIP_HOLES, copy the chunk as soon as one band reports data.
    4908         768 :                 int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    4909         768 :                 if (bCheckHoles)
    4910             :                 {
    4911         539 :                     nStatus = 0;
    4912         592 :                     for (int iBand = 0; iBand < nBandCount; iBand++)
    4913             :                     {
    4914         573 :                         nStatus |= poSrcDS->GetRasterBand(iBand + 1)
    4915         573 :                                        ->GetDataCoverageStatus(
    4916             :                                            iX, iY, nThisCols, nThisLines,
    4917             :                                            GDAL_DATA_COVERAGE_STATUS_DATA);
    4918         573 :                         if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    4919         520 :                             break;
    4920             :                     }
    4921             :                 }
    4922         768 :                 if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    4923             :                 {
    4924         749 :                     sExtraArg.pfnProgress = GDALScaledProgress;
    4925        1498 :                     sExtraArg.pProgressData = GDALCreateScaledProgress(
    4926         749 :                         nBlocksDone / static_cast<double>(nTotalBlocks),
    4927         749 :                         (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
    4928             :                         pfnProgress, pProgressData);
    4929         749 :                     if (sExtraArg.pProgressData == nullptr)
    4930         342 :                         sExtraArg.pfnProgress = nullptr;
    4931             :                     // Read the chunk for nBandCount bands, then write it out.
    4932         749 :                     eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    4933             :                                              nThisLines, pSwathBuf, nThisCols,
    4934             :                                              nThisLines, eDT, nBandCount,
    4935             :                                              nullptr, 0, 0, 0, &sExtraArg);
    4936             : 
    4937         749 :                     GDALDestroyScaledProgress(sExtraArg.pProgressData);
    4938             : 
    4939         749 :                     if (eErr == CE_None)
    4940         748 :                         eErr = poDstDS->RasterIO(
    4941             :                             GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
    4942             :                             nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
    4943             :                             0, 0, nullptr);
    4944             :                 }
    4945             :                 // Chunks skipped as holes still count towards progress.
    4946         768 :                 nBlocksDone++;
    4947        1532 :                 if (eErr == CE_None &&
    4948         764 :                     !pfnProgress(nBlocksDone /
    4949         764 :                                      static_cast<double>(nTotalBlocks),
    4950             :                                  nullptr, pProgressData))
    4951             :                 {
    4952           1 :                     eErr = CE_Failure;
    4953           1 :                     CPLError(CE_Failure, CPLE_UserInterrupt,
    4954             :                              "User terminated CreateCopy()");
    4955             :                 }
    4956             :             }
    4957             :         }
    4958             :     }
    4959             : 
    4960             :     /* -------------------------------------------------------------------- */
    4961             :     /*      Cleanup                                                         */
    4962             :     /* -------------------------------------------------------------------- */
    4963        3107 :     CPLFree(pSwathBuf);
    4964             : 
    4965        3107 :     return eErr;
    4966             : }
    4967             : 
    4968             : /************************************************************************/
    4969             : /*                     GDALRasterBandCopyWholeRaster()                  */
    4970             : /************************************************************************/
    4971             : 
    4972             : /**
    4973             :  * \brief Copy a whole raster band
    4974             :  *
    4975             :  * This function copies the complete raster contents of one band to
    4976             :  * another similarly configured band.  The source and destination
    4977             :  * bands must have the same width and height.  The bands do not have
    4978             :  * to have the same data type.
                     :  * Pixels are read from the source and written to the destination through
                     :  * an intermediate buffer typed with the destination band's data type.
    4979             :  *
    4980             :  * It implements efficient copying, in particular "chunking" the copy in
    4981             :  * substantial blocks.
                     :  * The chunk ("swath") dimensions are obtained from
                     :  * GDALCopyWholeRasterGetSwathSize().
    4982             :  *
    4983             :  * The papszOptions values currently supported are:
    4984             :  * <ul>
    4985             :  * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
    4986             :  * achieve best compression.</li>
    4987             :  * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
    4988             :  * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL &gt;= 2.2)</li>
    4989             :  * </ul>
    4990             :  *
    4991             :  * @param hSrcBand the source band
    4992             :  * @param hDstBand the destination band
    4993             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    4994             :  * @param pfnProgress progress reporting function. May be NULL, in which
                     :  * case GDALDummyProgress is used.
    4995             :  * @param pProgressData callback data for progress function.
    4996             :  *
    4997             :  * @return CE_None on success, or CE_Failure on failure: mismatched band
                     :  * sizes, failure to allocate the swath buffer, or the progress callback
                     :  * requesting cancellation (reported as CPLE_UserInterrupt). An error
                     :  * returned by a RasterIO() call stops the copy and is returned as is.
    4998             :  */
    4999             : 
    5000          28 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
    5001             :     GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
    5002             :     const char *const *const papszOptions, GDALProgressFunc pfnProgress,
    5003             :     void *pProgressData)
    5004             : 
    5005             : {
    5006          28 :     VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    5007          28 :     VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    5008             : 
    5009          28 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    5010          28 :     GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
    5011          28 :     CPLErr eErr = CE_None;
    5012             :     // A null progress callback is replaced so it can be called unconditionally.
    5013          28 :     if (pfnProgress == nullptr)
    5014           2 :         pfnProgress = GDALDummyProgress;
    5015             : 
    5016             :     /* -------------------------------------------------------------------- */
    5017             :     /*      Confirm the two bands match in size.                            */
    5018             :     /* -------------------------------------------------------------------- */
    5019          28 :     int nXSize = poSrcBand->GetXSize();
    5020          28 :     int nYSize = poSrcBand->GetYSize();
    5021             : 
    5022          28 :     if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
    5023             :     {
    5024           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    5025             :                  "Input and output band sizes do not\n"
    5026             :                  "match in GDALRasterBandCopyWholeRaster()");
    5027           0 :         return CE_Failure;
    5028             :     }
    5029             : 
    5030             :     /* -------------------------------------------------------------------- */
    5031             :     /*      Report preliminary (0) progress.                                */
    5032             :     /* -------------------------------------------------------------------- */
    5033          28 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    5034             :     {
    5035           0 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    5036             :                  "User terminated CreateCopy()");
    5037           0 :         return CE_Failure;
    5038             :     }
    5039             :     // The swath buffer and both RasterIO() calls use the destination type.
    5040          28 :     GDALDataType eDT = poDstBand->GetRasterDataType();
    5041             : 
    5042             :     // If the destination is compressed, we must try to write blocks just once,
    5043             :     // to save disk space (GTiff case for example), and to avoid data loss
    5044             :     // (JPEG compression for example).
    5045          28 :     bool bDstIsCompressed = false;
    5046             :     const char *pszDstCompressed =
    5047          28 :         CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
    5048          28 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5049          25 :         bDstIsCompressed = true;
    5050             : 
    5051             :     /* -------------------------------------------------------------------- */
    5052             :     /*      What will our swath size be?                                    */
    5053             :     /* -------------------------------------------------------------------- */
    5054             : 
    5055          28 :     int nSwathCols = 0;
    5056          28 :     int nSwathLines = 0;
    5057          28 :     GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
    5058             :                                     FALSE, &nSwathCols, &nSwathLines);
    5059             : 
    5060          28 :     const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5061             :     // One swath: nSwathCols x nSwathLines pixels of nPixelSize bytes each.
    5062          28 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    5063          28 :     if (pSwathBuf == nullptr)
    5064             :     {
    5065           0 :         return CE_Failure;
    5066             :     }
    5067             : 
    5068          28 :     CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
    5069             :              nSwathCols, nSwathLines);
    5070             : 
    5071             :     const bool bCheckHoles =
    5072          28 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    5073             : 
    5074             :     // Advise the source raster that we are going to read it completely
    5075          28 :     poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
    5076             : 
    5077             :     /* ==================================================================== */
    5078             :     /*      Copy the band one swath at a time.                              */
    5079             :     /* ==================================================================== */
    5080             : 
    5081          70 :     for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5082             :     {
    5083          42 :         int nThisLines = nSwathLines;
    5084             :         // Clip the last row of swaths to the raster height.
    5085          42 :         if (iY + nThisLines > nYSize)
    5086           8 :             nThisLines = nYSize - iY;
    5087             : 
    5088          84 :         for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    5089             :         {
    5090          42 :             int nThisCols = nSwathCols;
    5091             :             // Clip the last column of swaths to the raster width.
    5092          42 :             if (iX + nThisCols > nXSize)
    5093           0 :                 nThisCols = nXSize - iX;
    5094             :             // Treat the swath as holding data unless SKIP_HOLES says otherwise.
    5095          42 :             int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5096          42 :             if (bCheckHoles)
    5097             :             {
    5098           0 :                 nStatus = poSrcBand->GetDataCoverageStatus(
    5099             :                     iX, iY, nThisCols, nThisLines,
    5100             :                     GDAL_DATA_COVERAGE_STATUS_DATA);
    5101             :             }
    5102          42 :             if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5103             :             {
    5104          42 :                 eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
    5105             :                                            nThisLines, pSwathBuf, nThisCols,
    5106             :                                            nThisLines, eDT, 0, 0, nullptr);
    5107             :                 // Only write the swath when the read succeeded.
    5108          42 :                 if (eErr == CE_None)
    5109          42 :                     eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
    5110             :                                                nThisLines, pSwathBuf, nThisCols,
    5111             :                                                nThisLines, eDT, 0, 0, nullptr);
    5112             :             }
    5113             :             // Progress is the fraction of lines reached; a false return aborts.
    5114          84 :             if (eErr == CE_None &&
    5115          42 :                 !pfnProgress((iY + nThisLines) / static_cast<float>(nYSize),
    5116             :                              nullptr, pProgressData))
    5117             :             {
    5118           0 :                 eErr = CE_Failure;
    5119           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt,
    5120             :                          "User terminated CreateCopy()");
    5121             :             }
    5122             :         }
    5123             :     }
    5124             : 
    5125             :     /* -------------------------------------------------------------------- */
    5126             :     /*      Cleanup                                                         */
    5127             :     /* -------------------------------------------------------------------- */
    5128          28 :     CPLFree(pSwathBuf);
    5129             : 
    5130          28 :     return eErr;
    5131             : }
    5132             : 
    5133             : /************************************************************************/
    5134             : /*                      GDALCopyRasterIOExtraArg ()                     */
    5135             : /************************************************************************/
    5136             : /**
                     :  * \brief Initialize a GDALRasterIOExtraArg and copy settings from another.
                     :  *
                     :  * psDestArg is first passed through INIT_RASTERIO_EXTRA_ARG(). If psSrcArg
                     :  * is non-null, the resampling algorithm, the progress callback with its
                     :  * user data, and the bFloatingPointWindowValidity flag are then copied.
                     :  * The floating-point window (dfXOff, dfYOff, dfXSize, dfYSize) is copied
                     :  * only when that flag is set, and bUseOnlyThisScale only when the source
                     :  * structure reports nVersion >= 2.
                     :  *
                     :  * @param psDestArg structure to initialize and fill. It is dereferenced
                     :  *                  unconditionally, so it must not be NULL.
                     :  * @param psSrcArg structure to copy from. May be NULL, in which case
                     :  *                 psDestArg is only initialized.
                     :  */
    5137      525398 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
    5138             :                               GDALRasterIOExtraArg *psSrcArg)
    5139             : {
    5140      525398 :     INIT_RASTERIO_EXTRA_ARG(*psDestArg);
    5141      525398 :     if (psSrcArg)
    5142             :     {
    5143      525398 :         psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
    5144      525398 :         psDestArg->pfnProgress = psSrcArg->pfnProgress;
    5145      525398 :         psDestArg->pProgressData = psSrcArg->pProgressData;
    5146      525398 :         psDestArg->bFloatingPointWindowValidity =
    5147      525398 :             psSrcArg->bFloatingPointWindowValidity;
    5148      525398 :         if (psSrcArg->bFloatingPointWindowValidity)
    5149             :         {
    5150      204009 :             psDestArg->dfXOff = psSrcArg->dfXOff;
    5151      204009 :             psDestArg->dfYOff = psSrcArg->dfYOff;
    5152      204009 :             psDestArg->dfXSize = psSrcArg->dfXSize;
    5153      204009 :             psDestArg->dfYSize = psSrcArg->dfYSize;
    5154             :         }
    5155      525398 :         if (psSrcArg->nVersion >= 2)  // field is only read for version >= 2 sources
    5156             :         {
    5157      525398 :             psDestArg->bUseOnlyThisScale = psSrcArg->bUseOnlyThisScale;
    5158             :         }
    5159             :     }
    5160      525398 : }
    5161             : 
    5162             : /************************************************************************/
    5163             : /*                         HasOnlyNoData()                              */
    5164             : /************************************************************************/
    5165             : /**
                     :  * Tell whether a sample matches the nodata value.
                     :  *
                     :  * The generic version is a plain equality test. The GFloat16, float and
                     :  * double specializations below additionally treat a NaN sample as a match
                     :  * when the nodata value is itself NaN, which operator== alone would never
                     :  * report.
                     :  */
    5166    24858130 : template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
    5167             : {
    5168    24858130 :     return value == noDataValue;
    5169             : }
    5170             : // GFloat16: isnan is found through the using-declaration or ADL.
    5171           0 : template <> bool IsEqualToNoData<GFloat16>(GFloat16 value, GFloat16 noDataValue)
    5172             : {
    5173             :     using std::isnan;
    5174           0 :     return isnan(noDataValue) ? isnan(value) : value == noDataValue;
    5175             : }
    5176             : // float: NaN nodata matches any NaN sample, otherwise plain equality.
    5177      560433 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
    5178             : {
    5179      560433 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5180             : }
    5181             : // double: NaN nodata matches any NaN sample, otherwise plain equality.
    5182    13481800 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
    5183             : {
    5184    13481800 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5185             : }
    5186             : /**
                     :  * Tell whether every sample of a typed buffer matches noDataValue.
                     :  *
                     :  * The sample of component iBand at column iX of line iY is read at index
                     :  * (iY * nLineStride + iX) * nComponents + iBand. Five positions (the four
                     :  * corners and the middle pixel) are tested first for each component so
                     :  * that buffers containing data are usually rejected without a full scan.
                     :  *
                     :  * NOTE(review): the quick test computes nWidth - 1 and nHeight - 1 on
                     :  * size_t, so callers presumably guarantee nWidth >= 1 and nHeight >= 1
                     :  * whenever nComponents > 0 - confirm.
                     :  *
                     :  * @param pBuffer first sample of the region to test.
                     :  * @param noDataValue value compared with IsEqualToNoData().
                     :  * @param nWidth number of pixels tested on each line.
                     :  * @param nHeight number of lines tested.
                     :  * @param nLineStride distance, in pixels, between the starts of two
                     :  *                    consecutive lines.
                     :  * @param nComponents number of interleaved samples per pixel.
                     :  * @return true if all tested samples match noDataValue.
                     :  */
    5187             : template <class T>
    5188       15877 : static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
    5189             :                            size_t nHeight, size_t nLineStride,
    5190             :                            size_t nComponents)
    5191             : {
    5192             :     // Fast test: check the 4 corners and the middle pixel.
    5193       30853 :     for (size_t iBand = 0; iBand < nComponents; iBand++)
    5194             :     {
    5195       32495 :         if (!(IsEqualToNoData(pBuffer[iBand], noDataValue) &&
    5196       16162 :               IsEqualToNoData(pBuffer[(nWidth - 1) * nComponents + iBand],
    5197       15938 :                               noDataValue) &&
    5198       15938 :               IsEqualToNoData(
    5199       15938 :                   pBuffer[((nHeight - 1) / 2 * nLineStride + (nWidth - 1) / 2) *
    5200       15938 :                               nComponents +
    5201             :                           iBand],
    5202       14989 :                   noDataValue) &&
    5203       14989 :               IsEqualToNoData(
    5204       14989 :                   pBuffer[(nHeight - 1) * nLineStride * nComponents + iBand],
    5205             :                   noDataValue) &&
    5206       14981 :               IsEqualToNoData(
    5207       14981 :                   pBuffer[((nHeight - 1) * nLineStride + nWidth - 1) *
    5208       14981 :                               nComponents +
    5209             :                           iBand],
    5210             :                   noDataValue)))
    5211             :         {
    5212        1357 :             return false;
    5213             :         }
    5214             :     }
    5215             : 
    5216             :     // Test all pixels, line by line; samples beyond nWidth on a line are skipped.
    5217       45028 :     for (size_t iY = 0; iY < nHeight; iY++)
    5218             :     {
    5219       30562 :         const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
    5220    38852440 :         for (size_t iX = 0; iX < nWidth * nComponents; iX++)
    5221             :         {
    5222    38821960 :             if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
    5223             :             {
    5224          54 :                 return false;
    5225             :             }
    5226             :         }
    5227             :     }
    5228       14466 :     return true;
    5229             : }
    5230             : 
    5231             : /************************************************************************/
    5232             : /*                    GDALBufferHasOnlyNoData()                         */
    5233             : /************************************************************************/
    5234             : /**
                     :  * \brief Tell whether every sample of a buffer equals the nodata value.
                     :  *
                     :  * When the nodata value is 0, lines are contiguous (nWidth == nLineStride)
                     :  * and samples are not floating point, the buffer is scanned as raw bytes:
                     :  * byte by byte up to the next WordType boundary, then one WordType at a
                     :  * time, then byte by byte for the tail. The scanned size is the total
                     :  * number of bits rounded up to whole bytes.
                     :  *
                     :  * Otherwise the call is dispatched to HasOnlyNoDataT() according to
                     :  * nBitsPerSample and nSampleFormat. Integer cases first require
                     :  * GDALIsValueInRange<T>(dfNoDataValue) to hold for the sample type;
                     :  * signed buffers are compared as unsigned ones after casting the nodata
                     :  * value through the signed type. The 16- and 32-bit floating-point cases
                     :  * accept a NaN nodata value or one passing GDALIsValueInRange<T>(); the
                     :  * 64-bit floating-point case has no range check.
                     :  *
                     :  * @param pBuffer buffer to test.
                     :  * @param dfNoDataValue nodata value.
                     :  * @param nWidth number of pixels tested on each line.
                     :  * @param nHeight number of lines tested.
                     :  * @param nLineStride distance, in pixels, between the starts of two
                     :  *                    consecutive lines.
                     :  * @param nComponents number of interleaved samples per pixel.
                     :  * @param nBitsPerSample size of one sample in bits. The typed path handles
                     :  *                       8, 16, 32 and 64 for integers and 16, 32 and 64
                     :  *                       for floating point.
                     :  * @param nSampleFormat GSF_UNSIGNED_INT, GSF_SIGNED_INT or
                     :  *                      GSF_FLOATING_POINT.
                     :  * @return true if all samples equal the nodata value. false otherwise,
                     :  *         including when a range check fails or when the
                     :  *         nBitsPerSample / nSampleFormat combination is not handled.
                     :  */
    5235       42598 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
    5236             :                              size_t nWidth, size_t nHeight, size_t nLineStride,
    5237             :                              size_t nComponents, int nBitsPerSample,
    5238             :                              GDALBufferSampleFormat nSampleFormat)
    5239             : {
    5240             :     // In the case where the nodata is 0, we can compare several bytes at
    5241             :     // once. Select the largest natural integer type for the architecture.
    5242             : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
    5243             :     // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
    5244             :     typedef std::uint64_t WordType;
    5245             : #else
    5246             :     typedef std::uint32_t WordType;
    5247             : #endif
    5248       42598 :     if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    5249             :         // Do not use this optimized code path for floating point numbers,
    5250             :         // as it can't detect negative zero.
    5251             :         nSampleFormat != GSF_FLOATING_POINT)
    5252             :     {
    5253       26715 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    5254       26715 :         const size_t nSize =
    5255       26715 :             (nWidth * nHeight * nComponents * nBitsPerSample + 7) / 8;
    5256       26715 :         size_t i = 0;
    5257             :         const size_t nInitialIters =
    5258       53430 :             std::min(sizeof(WordType) -
    5259       26715 :                          static_cast<size_t>(
    5260             :                              reinterpret_cast<std::uintptr_t>(pabyBuffer) %
    5261             :                              sizeof(WordType)),
    5262       26715 :                      nSize);
    5263      220399 :         for (; i < nInitialIters; i++)  // bytes up to the next WordType boundary
    5264             :         {
    5265      198058 :             if (pabyBuffer[i])
    5266        4374 :                 return false;
    5267             :         }
    5268    16516400 :         for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))  // whole words
    5269             :         {
    5270    16501200 :             if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
    5271        7197 :                 return false;
    5272             :         }
    5273       52533 :         for (; i < nSize; i++)  // remaining tail bytes
    5274             :         {
    5275       37394 :             if (pabyBuffer[i])
    5276           5 :                 return false;
    5277             :         }
    5278       15139 :         return true;
    5279             :     }
    5280             : 
    5281       15883 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
    5282             :     {
    5283       22270 :         return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
    5284       11135 :                HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
    5285       11135 :                               static_cast<uint8_t>(dfNoDataValue), nWidth,
    5286       11135 :                               nHeight, nLineStride, nComponents);
    5287             :     }
    5288        4748 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
    5289             :     {
    5290             :         // Use unsigned implementation by converting the nodatavalue to
    5291             :         // unsigned
    5292          63 :         return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
    5293          31 :                HasOnlyNoDataT(
    5294             :                    static_cast<const uint8_t *>(pBuffer),
    5295          31 :                    static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
    5296          32 :                    nWidth, nHeight, nLineStride, nComponents);
    5297             :     }
    5298        4716 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
    5299             :     {
    5300          21 :         return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
    5301          10 :                HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
    5302          10 :                               static_cast<uint16_t>(dfNoDataValue), nWidth,
    5303          11 :                               nHeight, nLineStride, nComponents);
    5304             :     }
    5305        4705 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
    5306             :     {
    5307             :         // Use unsigned implementation by converting the nodatavalue to
    5308             :         // unsigned
    5309          97 :         return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
    5310          48 :                HasOnlyNoDataT(
    5311             :                    static_cast<const uint16_t *>(pBuffer),
    5312          48 :                    static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
    5313          49 :                    nWidth, nHeight, nLineStride, nComponents);
    5314             :     }
    5315        4656 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
    5316             :     {
    5317          73 :         return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
    5318          36 :                HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
    5319             :                               static_cast<uint32_t>(dfNoDataValue), nWidth,
    5320          37 :                               nHeight, nLineStride, nComponents);
    5321             :     }
    5322        4619 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
    5323             :     {
    5324             :         // Use unsigned implementation by converting the nodatavalue to
    5325             :         // unsigned
    5326          23 :         return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
    5327          11 :                HasOnlyNoDataT(
    5328             :                    static_cast<const uint32_t *>(pBuffer),
    5329          11 :                    static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
    5330          12 :                    nWidth, nHeight, nLineStride, nComponents);
    5331             :     }
    5332        4607 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
    5333             :     {
    5334          56 :         return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
    5335          28 :                HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
    5336             :                               static_cast<uint64_t>(dfNoDataValue), nWidth,
    5337          28 :                               nHeight, nLineStride, nComponents);
    5338             :     }
    5339        4579 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
    5340             :     {
    5341             :         // Use unsigned implementation by converting the nodatavalue to
    5342             :         // unsigned
    5343           0 :         return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
    5344           0 :                HasOnlyNoDataT(
    5345             :                    static_cast<const uint64_t *>(pBuffer),
    5346           0 :                    static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
    5347           0 :                    nWidth, nHeight, nLineStride, nComponents);
    5348             :     }
    5349        4579 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_FLOATING_POINT)
    5350             :     {
    5351           0 :         return (std::isnan(dfNoDataValue) ||
    5352           0 :                 GDALIsValueInRange<GFloat16>(dfNoDataValue)) &&
    5353           0 :                HasOnlyNoDataT(static_cast<const GFloat16 *>(pBuffer),
    5354             :                               static_cast<GFloat16>(dfNoDataValue), nWidth,
    5355           0 :                               nHeight, nLineStride, nComponents);
    5356             :     }
    5357        4579 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    5358             :     {
    5359         750 :         return (std::isnan(dfNoDataValue) ||
    5360        1499 :                 GDALIsValueInRange<float>(dfNoDataValue)) &&
    5361         749 :                HasOnlyNoDataT(static_cast<const float *>(pBuffer),
    5362             :                               static_cast<float>(dfNoDataValue), nWidth,
    5363         750 :                               nHeight, nLineStride, nComponents);
    5364             :     }
    5365        3829 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    5366             :     {
    5367        3829 :         return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
    5368             :                               dfNoDataValue, nWidth, nHeight, nLineStride,
    5369        3829 :                               nComponents);
    5370             :     }
    5371           0 :     return false;  // unhandled nBitsPerSample / nSampleFormat combination
    5372             : }
    5373             : 
    5374             : #ifdef HAVE_SSE2
    5375             : 
    5376             : /************************************************************************/
    5377             : /*                    GDALDeinterleave3Byte()                           */
    5378             : /************************************************************************/
    5379             : /**
                     :  * Split nIters pixel-interleaved 3-byte pixels into three byte planes.
                     :  *
                     :  * Source byte 3 * i + k is written to byte i of destination plane k.
                     :  * With USE_NEON_OPTIMIZATIONS the work is delegated unconditionally to
                     :  * GDALDeinterleave3Byte_SSSE3(). Otherwise that routine is used when the
                     :  * build has HAVE_SSSE3_AT_COMPILE_TIME and CPLHaveRuntimeSSSE3() reports
                     :  * support, and a scalar fallback runs in the remaining cases.
                     :  *
                     :  * The fallback handles 4 pixels (12 source bytes) per iteration with
                     :  * three 32-bit loads when all four pointers are aligned on
                     :  * sizeof(unsigned int), then finishes byte by byte. Its shifts and masks
                     :  * take the lowest-addressed byte of a word as the least-significant one,
                     :  * i.e. they rely on little-endian byte order.
                     :  *
                     :  * @param pabySrc interleaved source, 3 * nIters bytes are read.
                     :  * @param pabyDest0 destination for the first byte of each pixel.
                     :  * @param pabyDest1 destination for the second byte of each pixel.
                     :  * @param pabyDest2 destination for the third byte of each pixel.
                     :  * @param nIters number of pixels to process.
                     :  */
    5380             : #if defined(__GNUC__) && !defined(__clang__)
    5381             : __attribute__((optimize("no-tree-vectorize")))
    5382             : #endif
    5383             : static void
    5384      152792 : GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    5385             :                       GByte *CPL_RESTRICT pabyDest0,
    5386             :                       GByte *CPL_RESTRICT pabyDest1,
    5387             :                       GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    5388             : #ifdef USE_NEON_OPTIMIZATIONS
    5389             : {
    5390             :     return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5391             :                                        nIters);
    5392             : }
    5393             : #else
    5394             : {
    5395             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    5396      152792 :     if (CPLHaveRuntimeSSSE3())
    5397             :     {
    5398      152806 :         return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    5399      152797 :                                            pabyDest2, nIters);
    5400             :     }
    5401             : #endif
    5402             :     // Scalar fallback; i counts pixels already written to the three planes.
    5403           0 :     size_t i = 0;
    5404           0 :     if (((reinterpret_cast<uintptr_t>(pabySrc) |
    5405           0 :           reinterpret_cast<uintptr_t>(pabyDest0) |
    5406           0 :           reinterpret_cast<uintptr_t>(pabyDest1) |
    5407           0 :           reinterpret_cast<uintptr_t>(pabyDest2)) %
    5408             :          sizeof(unsigned int)) == 0)
    5409             :     {
    5410             :         // Slightly better than GCC autovectorizer
    5411          17 :         for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
    5412             :         {
    5413          15 :             unsigned int word0 =
    5414          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
    5415          15 :             unsigned int word1 =
    5416          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
    5417          15 :             unsigned int word2 =
    5418          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
    5419          15 :             reinterpret_cast<unsigned int *>(pabyDest0)[j] =
    5420          15 :                 (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
    5421          15 :                 ((word2 >> 8) << 24);
    5422          15 :             reinterpret_cast<unsigned int *>(pabyDest1)[j] =
    5423          15 :                 ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
    5424          15 :                 (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
    5425          15 :             pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
    5426          15 :             pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
    5427          15 :             pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
    5428          15 :             pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
    5429             :         }
    5430             :     }
    5431             : #if defined(__clang__)
    5432             : #pragma clang loop vectorize(disable)
    5433             : #endif
    5434           0 :     for (; i < nIters; ++i)  // leftover pixels, or all of them when unaligned
    5435             :     {
    5436           1 :         pabyDest0[i] = pabySrc[3 * i + 0];
    5437           1 :         pabyDest1[i] = pabySrc[3 * i + 1];
    5438           1 :         pabyDest2[i] = pabySrc[3 * i + 2];
    5439             :     }
    5440             : }
    5441             : #endif
    5442             : 
    5443             : /************************************************************************/
    5444             : /*                    GDALDeinterleave4Byte()                           */
    5445             : /************************************************************************/
    5446             : 
    5447             : #if !defined(__GNUC__) || defined(__clang__)
    5448             : 
    5449             : /************************************************************************/
    5450             : /*                         deinterleave()                               */
    5451             : /************************************************************************/
    5452             : 
    5453             : template <bool SHIFT, bool MASK>  // SHIFT: shift the inputs right by 8 bits first; MASK: keep only the low byte of each int32 lane
    5454             : inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,  // Packs the 16 int32 lanes of the four inputs into the 16 bytes of the returned register
    5455             :                             __m128i &xmm2_ori, __m128i &xmm3_ori)
    5456             : {
    5457             :     // When SHIFT is set, logically shift each int32 lane right by 8 bits; the inputs are non-const references, so the caller's registers are modified too
    5458             :     if (SHIFT)
    5459             :     {
    5460             :         xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
    5461             :         xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
    5462             :         xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
    5463             :         xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
    5464             :     }
    5465             :     __m128i xmm0;
    5466             :     __m128i xmm1;
    5467             :     __m128i xmm2;
    5468             :     __m128i xmm3;
    5469             :     if (MASK)  // set the upper 24 bits of each int32 lane to 0 (on local copies only)
    5470             :     {
    5471             :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    5472             :         xmm0 = _mm_and_si128(xmm0_ori, xmm_mask);
    5473             :         xmm1 = _mm_and_si128(xmm1_ori, xmm_mask);
    5474             :         xmm2 = _mm_and_si128(xmm2_ori, xmm_mask);
    5475             :         xmm3 = _mm_and_si128(xmm3_ori, xmm_mask);
    5476             :     }
    5477             :     else
    5478             :     {
    5479             :         xmm0 = xmm0_ori;
    5480             :         xmm1 = xmm1_ori;
    5481             :         xmm2 = xmm2_ori;
    5482             :         xmm3 = xmm3_ori;
    5483             :     }
    5484             :     // Pack int32 to int16 (signed saturation): xmm0 with xmm1, then xmm2 with xmm3
    5485             :     xmm0 = _mm_packs_epi32(xmm0, xmm1);
    5486             :     xmm2 = _mm_packs_epi32(xmm2, xmm3);
    5487             :     // Pack int16 to uint8 (unsigned saturation)
    5488             :     xmm0 = _mm_packus_epi16(xmm0, xmm2);
    5489             :     return xmm0;
    5490             : }
    5491             : 
    5492             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,  // Splits nIters 4-byte pixels read from pabySrc into four per-component byte arrays
    5493             :                                   GByte *CPL_RESTRICT pabyDest0,
    5494             :                                   GByte *CPL_RESTRICT pabyDest1,
    5495             :                                   GByte *CPL_RESTRICT pabyDest2,
    5496             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    5497             : #ifdef USE_NEON_OPTIMIZATIONS
    5498             : {
    5499             :     return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,  // NEON build: always forward to the _SSSE3 variant (defined outside this section)
    5500             :                                        pabyDest3, nIters);
    5501             : }
    5502             : #else
    5503             : {
    5504             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    5505             :     if (CPLHaveRuntimeSSSE3())  // use the SSSE3 variant whenever the runtime check succeeds
    5506             :     {
    5507             :         return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    5508             :                                            pabyDest2, pabyDest3, nIters);
    5509             :     }
    5510             : #endif
    5511             : 
    5512             :     // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
    5513             :     // do something slightly better.
    5514             :     size_t i = 0;
    5515             :     for (; i + 15 < nIters; i += 16)  // 16 pixels (64 source bytes, four unaligned loads) per iteration
    5516             :     {
    5517             :         __m128i xmm0_ori = _mm_loadu_si128(
    5518             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
    5519             :         __m128i xmm1_ori = _mm_loadu_si128(
    5520             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
    5521             :         __m128i xmm2_ori = _mm_loadu_si128(
    5522             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
    5523             :         __m128i xmm3_ori = _mm_loadu_si128(
    5524             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
    5525             : 
    5526             :         _mm_storeu_si128(
    5527             :             reinterpret_cast<__m128i *>(pabyDest0 + i),
    5528             :             deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));  // component 0: low byte of each 32-bit pixel, no shift
    5529             :         _mm_storeu_si128(
    5530             :             reinterpret_cast<__m128i *>(pabyDest1 + i),
    5531             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));  // component 1: shifts the *_ori registers right by 8 in place, then masks
    5532             :         _mm_storeu_si128(
    5533             :             reinterpret_cast<__m128i *>(pabyDest2 + i),
    5534             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));  // component 2: a further 8-bit shift (16 bits in total), then masks
    5535             :         _mm_storeu_si128(
    5536             :             reinterpret_cast<__m128i *>(pabyDest3 + i),
    5537             :             deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));  // component 3: 24 bits shifted in total, upper bits are already 0 so no mask
    5538             :     }
    5539             : 
    5540             : #if defined(__clang__)
    5541             : #pragma clang loop vectorize(disable)
    5542             : #endif
    5543             :     for (; i < nIters; ++i)  // scalar loop for the remaining (fewer than 16) pixels
    5544             :     {
    5545             :         pabyDest0[i] = pabySrc[4 * i + 0];
    5546             :         pabyDest1[i] = pabySrc[4 * i + 1];
    5547             :         pabyDest2[i] = pabySrc[4 * i + 2];
    5548             :         pabyDest3[i] = pabySrc[4 * i + 3];
    5549             :     }
    5550             : }
    5551             : #endif
    5552             : #else
    5553             : // GCC autovectorizer does an excellent job, so this variant is a plain loop compiled with tree-vectorize enabled
    5554       61587 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
    5555             :     const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    5556             :     GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
    5557             :     GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    5558             : {
    5559   528343000 :     for (size_t i = 0; i < nIters; ++i)  // one 4-byte source pixel per iteration
    5560             :     {
    5561   528282000 :         pabyDest0[i] = pabySrc[4 * i + 0];  // byte k of pixel i goes to pabyDest<k>[i]
    5562   528282000 :         pabyDest1[i] = pabySrc[4 * i + 1];
    5563   528282000 :         pabyDest2[i] = pabySrc[4 * i + 2];
    5564   528282000 :         pabyDest3[i] = pabySrc[4 * i + 3];
    5565             :     }
    5566       61587 : }
    5567             : #endif
    5568             : 
    5569             : #else
    5570             : 
    5571             : /************************************************************************/
    5572             : /*                    GDALDeinterleave3Byte()                           */
    5573             : /************************************************************************/
    5574             : 
    5575             : // TODO: Enabling below could help on non-Intel architectures where GCC knows
    5576             : // how to auto-vectorize
    5577             : // #if defined(__GNUC__)
    5578             : //__attribute__((optimize("tree-vectorize")))
    5579             : // #endif
    5580             : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,  // Portable fallback: splits nIters 3-byte pixels into three per-component byte arrays
    5581             :                                   GByte *CPL_RESTRICT pabyDest0,
    5582             :                                   GByte *CPL_RESTRICT pabyDest1,
    5583             :                                   GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    5584             : {
    5585             :     for (size_t i = 0; i < nIters; ++i)  // one 3-byte source pixel per iteration
    5586             :     {
    5587             :         pabyDest0[i] = pabySrc[3 * i + 0];  // byte k of pixel i goes to pabyDest<k>[i]
    5588             :         pabyDest1[i] = pabySrc[3 * i + 1];
    5589             :         pabyDest2[i] = pabySrc[3 * i + 2];
    5590             :     }
    5591             : }
    5592             : 
    5593             : /************************************************************************/
    5594             : /*                    GDALDeinterleave4Byte()                           */
    5595             : /************************************************************************/
    5596             : 
    5597             : // TODO: Enabling below could help on non-Intel architectures where gcc knows
    5598             : // how to auto-vectorize
    5599             : // #if defined(__GNUC__)
    5600             : //__attribute__((optimize("tree-vectorize")))
    5601             : // #endif
    5602             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,  // Portable fallback: splits nIters 4-byte pixels into four per-component byte arrays
    5603             :                                   GByte *CPL_RESTRICT pabyDest0,
    5604             :                                   GByte *CPL_RESTRICT pabyDest1,
    5605             :                                   GByte *CPL_RESTRICT pabyDest2,
    5606             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    5607             : {
    5608             :     for (size_t i = 0; i < nIters; ++i)  // one 4-byte source pixel per iteration
    5609             :     {
    5610             :         pabyDest0[i] = pabySrc[4 * i + 0];  // byte k of pixel i goes to pabyDest<k>[i]
    5611             :         pabyDest1[i] = pabySrc[4 * i + 1];
    5612             :         pabyDest2[i] = pabySrc[4 * i + 2];
    5613             :         pabyDest3[i] = pabySrc[4 * i + 3];
    5614             :     }
    5615             : }
    5616             : 
    5616             : 
    5617             : #endif
    5618             : 
    5619             : /************************************************************************/
    5620             : /*                      GDALDeinterleave()                              */
    5621             : /************************************************************************/
    5622             : 
    5623             : /*! Copy values from a pixel-interleaved buffer to multiple per-component
    5624             :     buffers.
    5625             : 
    5626             :     In pseudo-code
    5627             :     \verbatim
    5628             :     for(size_t i = 0; i < nIters; ++i)
    5629             :         for(int iComp = 0; iComp < nComponents; iComp++ )
    5630             :             ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
    5631             :     \endverbatim
    5632             : 
    5633             :     The implementation is optimized for a few cases, like de-interleaving
    5634             :     of 3- or 4-component Byte buffers.
    5635             : 
    5636             :     \since GDAL 3.6
    5637             :  */
    5638      214732 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
    5639             :                       int nComponents, void **ppDestBuffer,
    5640             :                       GDALDataType eDestDT, size_t nIters)
    5641             : {
    5642      214732 :     if (eSourceDT == eDestDT)  // the specialised paths below are only taken when no type conversion is needed
    5643             :     {
    5644      214709 :         if (eSourceDT == GDT_Byte || eSourceDT == GDT_Int8)  // both are processed as raw GByte values
    5645             :         {
    5646      214391 :             if (nComponents == 3)
    5647             :             {
    5648      152797 :                 const GByte *CPL_RESTRICT pabySrc =
    5649             :                     static_cast<const GByte *>(pSourceBuffer);
    5650      152797 :                 GByte *CPL_RESTRICT pabyDest0 =
    5651             :                     static_cast<GByte *>(ppDestBuffer[0]);
    5652      152797 :                 GByte *CPL_RESTRICT pabyDest1 =
    5653             :                     static_cast<GByte *>(ppDestBuffer[1]);
    5654      152797 :                 GByte *CPL_RESTRICT pabyDest2 =
    5655             :                     static_cast<GByte *>(ppDestBuffer[2]);
    5656      152797 :                 GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5657             :                                       nIters);
    5658      152803 :                 return;
    5659             :             }
    5660       61594 :             else if (nComponents == 4)
    5661             :             {
    5662       61587 :                 const GByte *CPL_RESTRICT pabySrc =
    5663             :                     static_cast<const GByte *>(pSourceBuffer);
    5664       61587 :                 GByte *CPL_RESTRICT pabyDest0 =
    5665             :                     static_cast<GByte *>(ppDestBuffer[0]);
    5666       61587 :                 GByte *CPL_RESTRICT pabyDest1 =
    5667             :                     static_cast<GByte *>(ppDestBuffer[1]);
    5668       61587 :                 GByte *CPL_RESTRICT pabyDest2 =
    5669             :                     static_cast<GByte *>(ppDestBuffer[2]);
    5670       61587 :                 GByte *CPL_RESTRICT pabyDest3 =
    5671             :                     static_cast<GByte *>(ppDestBuffer[3]);
    5672       61587 :                 GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5673             :                                       pabyDest3, nIters);
    5674       61587 :                 return;
    5675           7 :             }
    5676             :         }
    5677             : #if ((defined(__GNUC__) && !defined(__clang__)) ||                             \
    5678             :      defined(__INTEL_CLANG_COMPILER)) &&                                       \
    5679             :     defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
    5680         636 :         else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&  // both are processed as raw GUInt16 values
    5681         318 :                  CPLHaveRuntimeSSSE3())
    5682             :         {
    5683         318 :             if (nComponents == 3)
    5684             :             {
    5685         123 :                 const GUInt16 *CPL_RESTRICT panSrc =
    5686             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    5687         123 :                 GUInt16 *CPL_RESTRICT panDest0 =
    5688             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    5689         123 :                 GUInt16 *CPL_RESTRICT panDest1 =
    5690             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    5691         123 :                 GUInt16 *CPL_RESTRICT panDest2 =
    5692             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    5693         123 :                 GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
    5694             :                                               panDest2, nIters);
    5695         123 :                 return;
    5696             :             }
    5697             : #if !defined(__INTEL_CLANG_COMPILER)
    5698             :             // ICC autovectorizer doesn't do a good job, at least with icx
    5699             :             // 2022.1.0.20220316
    5700         195 :             else if (nComponents == 4)
    5701             :             {
    5702         195 :                 const GUInt16 *CPL_RESTRICT panSrc =
    5703             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    5704         195 :                 GUInt16 *CPL_RESTRICT panDest0 =
    5705             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    5706         195 :                 GUInt16 *CPL_RESTRICT panDest1 =
    5707             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    5708         195 :                 GUInt16 *CPL_RESTRICT panDest2 =
    5709             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    5710         195 :                 GUInt16 *CPL_RESTRICT panDest3 =
    5711             :                     static_cast<GUInt16 *>(ppDestBuffer[3]);
    5712         195 :                 GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
    5713             :                                               panDest2, panDest3, nIters);
    5714         195 :                 return;
    5715             :             }
    5716             : #endif
    5717             :         }
    5718             : #endif
    5719             :     }
    5720             : 
    5721          30 :     const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);  // generic fallback for every case that did not return above
    5722          29 :     const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
    5723         108 :     for (int iComp = 0; iComp < nComponents; iComp++)  // one strided copy (with type conversion if needed) per component
    5724             :     {
    5725          79 :         GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
    5726          79 :                             iComp * nSourceDTSize,  // start at this component's first value in the interleaved source
    5727             :                         eSourceDT, nComponents * nSourceDTSize,  // source stride: one whole pixel
    5728          79 :                         ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);  // destination stride: one value, i.e. densely packed
    5729             :     }
    5730             : }
    5731             : 
    5732             : /************************************************************************/
    5733             : /*                    GDALTranspose2DSingleToSingle()                   */
    5734             : /************************************************************************/
    5735             : /**
    5736             :  * Transpose a 2D array of non-complex values, in an efficient (cache-oblivious) way.
    5737             :  *
    5738             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    5739             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    5740             :  * @param nSrcWidth Width of pSrc array.
    5741             :  * @param nSrcHeight Height of pSrc array.
    5742             :  */
    5743             : 
    5744             : template <class DST, class SRC>
    5745         145 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
    5746             :                                    DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    5747             :                                    size_t nSrcHeight)
    5748             : {
    5749         145 :     constexpr size_t blocksize = 32;  // the array is processed in tiles of at most blocksize x blocksize elements
    5750         315 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)  // i: first source row of the current tile
    5751             :     {
    5752         170 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);  // one past the tile's last source row, clamped to the array
    5753         390 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)  // j: first source column of the current tile
    5754             :         {
    5755             :             // transpose the block beginning at [i,j]
    5756         220 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);  // one past the tile's last source column, clamped to the array
    5757        2509 :             for (size_t k = i; k < max_k; ++k)
    5758             :             {
    5759       41017 :                 for (size_t l = j; l < max_l; ++l)
    5760             :                 {
    5761       38728 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],  // source element at (row k, column l) ...
    5762       38728 :                                  pDst[k + l * nSrcHeight]);  // ... goes to destination (row l, column k)
    5763             :                 }
    5764             :             }
    5765             :         }
    5766             :     }
    5767         145 : }
    5768             : 
    5769             : /************************************************************************/
    5770             : /*                   GDALTranspose2DComplexToComplex()                  */
    5771             : /************************************************************************/
    5772             : /**
    5773             :  * Transpose a 2D array of complex values into an array of complex values,
    5774             :  * in an efficient (cache-oblivious) way.
    5775             :  *
    5776             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    5777             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    5778             :  * @param nSrcWidth Width of pSrc array.
    5779             :  * @param nSrcHeight Height of pSrc array.
    5780             :  */
    5781             : template <class DST, class SRC>
    5782          25 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
    5783             :                                      DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    5784             :                                      size_t nSrcHeight)
    5785             : {
    5786          25 :     constexpr size_t blocksize = 32;  // the array is processed in tiles of at most blocksize x blocksize elements
    5787          50 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)  // i: first source row of the current tile
    5788             :     {
    5789          25 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);  // one past the tile's last source row, clamped to the array
    5790          50 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)  // j: first source column of the current tile
    5791             :         {
    5792             :             // transpose the block beginning at [i,j]
    5793          25 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);  // one past the tile's last source column, clamped to the array
    5794          75 :             for (size_t k = i; k < max_k; ++k)
    5795             :             {
    5796         200 :                 for (size_t l = j; l < max_l; ++l)
    5797             :                 {
    5798         150 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],  // each element occupies two consecutive values; first one (presumably the real part)
    5799         150 :                                  pDst[2 * (k + l * nSrcHeight) + 0]);
    5800         150 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],  // second value of the element (presumably the imaginary part)
    5801         150 :                                  pDst[2 * (k + l * nSrcHeight) + 1]);
    5802             :                 }
    5803             :             }
    5804             :         }
    5805             :     }
    5806          25 : }
    5807             : 
    5808             : /************************************************************************/
    5809             : /*                   GDALTranspose2DComplexToSingle()                  */
    5810             : /************************************************************************/
    5811             : /**
    5812             :  * Transpose a 2D array of complex values into an array of non-complex values,
    5813             :  * in an efficient (cache-oblivious) way.
    5814             :  *
    5815             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    5816             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    5817             :  * @param nSrcWidth Width of pSrc array.
    5818             :  * @param nSrcHeight Height of pSrc array.
    5819             :  */
    5820             : template <class DST, class SRC>
    5821          55 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
    5822             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    5823             :                                     size_t nSrcHeight)
    5824             : {
    5825          55 :     constexpr size_t blocksize = 32;  // the array is processed in tiles of at most blocksize x blocksize elements
    5826         110 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)  // i: first source row of the current tile
    5827             :     {
    5828          55 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);  // one past the tile's last source row, clamped to the array
    5829         110 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)  // j: first source column of the current tile
    5830             :         {
    5831             :             // transpose the block beginning at [i,j]
    5832          55 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);  // one past the tile's last source column, clamped to the array
    5833         165 :             for (size_t k = i; k < max_k; ++k)
    5834             :             {
    5835         440 :                 for (size_t l = j; l < max_l; ++l)
    5836             :                 {
    5837         330 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],  // only the first value of each source pair is read (presumably the real part); the second is ignored
    5838         330 :                                  pDst[k + l * nSrcHeight]);
    5839             :                 }
    5840             :             }
    5841             :         }
    5842             :     }
    5843          55 : }
    5844             : 
    5845             : /************************************************************************/
    5846             : /*                   GDALTranspose2DSingleToComplex()                  */
    5847             : /************************************************************************/
    5848             : /**
    5849             :  * Transpose a 2D array of non-complex values into an array of complex values,
    5850             :  * in an efficient (cache-oblivious) way.
    5851             :  *
    5852             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    5853             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    5854             :  * @param nSrcWidth Width of pSrc array.
    5855             :  * @param nSrcHeight Height of pSrc array.
    5856             :  */
    5857             : template <class DST, class SRC>
    5858          55 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
    5859             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    5860             :                                     size_t nSrcHeight)
    5861             : {
    5862          55 :     constexpr size_t blocksize = 32;  // the array is processed in tiles of at most blocksize x blocksize elements
    5863         110 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)  // i: first source row of the current tile
    5864             :     {
    5865          55 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);  // one past the tile's last source row, clamped to the array
    5866         110 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)  // j: first source column of the current tile
    5867             :         {
    5868             :             // transpose the block beginning at [i,j]
    5869          55 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);  // one past the tile's last source column, clamped to the array
    5870         165 :             for (size_t k = i; k < max_k; ++k)
    5871             :             {
    5872         440 :                 for (size_t l = j; l < max_l; ++l)
    5873             :                 {
    5874         330 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],  // the source value becomes the first value of the destination pair (presumably the real part)
    5875         330 :                                  pDst[2 * (k + l * nSrcHeight) + 0]);
    5876         330 :                     pDst[2 * (k + l * nSrcHeight) + 1] = 0;  // the second value of the pair is always set to 0
    5877             :                 }
    5878             :             }
    5879             :         }
    5880             :     }
    5881          55 : }
    5882             : 
    5883             : /************************************************************************/
    5884             : /*                        GDALTranspose2D()                             */
    5885             : /************************************************************************/
    5886             : 
    5887             : template <class DST, bool DST_IS_COMPLEX>
    5888         280 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,  // Casts pSrc according to eSrcType and calls the matching GDALTranspose2D*To*() helper
    5889             :                             size_t nSrcWidth, size_t nSrcHeight)
    5890             : {
    5891             : #define CALL_GDALTranspose2D_internal(SRC_TYPE)                                \
    5892             :     do                                                                         \
    5893             :     {                                                                          \
    5894             :         if constexpr (DST_IS_COMPLEX)                                          \
    5895             :         {                                                                      \
    5896             :             GDALTranspose2DSingleToComplex(                                    \
    5897             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    5898             :                 nSrcHeight);                                                   \
    5899             :         }                                                                      \
    5900             :         else                                                                   \
    5901             :         {                                                                      \
    5902             :             GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
    5903             :                                           pDst, nSrcWidth, nSrcHeight);        \
    5904             :         }                                                                      \
    5905             :     } while (0)
    5906             : 
    5907             : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE)                         \
    5908             :     do                                                                         \
    5909             :     {                                                                          \
    5910             :         if constexpr (DST_IS_COMPLEX)                                          \
    5911             :         {                                                                      \
    5912             :             GDALTranspose2DComplexToComplex(                                   \
    5913             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    5914             :                 nSrcHeight);                                                   \
    5915             :         }                                                                      \
    5916             :         else                                                                   \
    5917             :         {                                                                      \
    5918             :             GDALTranspose2DComplexToSingle(                                    \
    5919             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    5920             :                 nSrcHeight);                                                   \
    5921             :         }                                                                      \
    5922             :     } while (0)
    5923             : 
    5924             :     // clang-format off
    5925         280 :     switch (eSrcType)  // non-complex source types use the first macro, complex ones the second (with their component type as SRC_TYPE)
    5926             :     {
    5927          16 :         case GDT_Byte:     CALL_GDALTranspose2D_internal(uint8_t); break;
    5928          15 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t); break;
    5929          24 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t); break;
    5930          16 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t); break;
    5931          24 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t); break;
    5932          16 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t); break;
    5933          16 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t); break;
    5934          16 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t); break;
    5935          16 :         case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16); break;
    5936          17 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float); break;
    5937          24 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double); break;
    5938          16 :         case GDT_CInt16:   CALL_GDALTranspose2DComplex_internal(int16_t); break;
    5939          16 :         case GDT_CInt32:   CALL_GDALTranspose2DComplex_internal(int32_t); break;
    5940          16 :         case GDT_CFloat16: CALL_GDALTranspose2DComplex_internal(GFloat16); break;
    5941          16 :         case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
    5942          16 :         case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
    5943           0 :         case GDT_Unknown:  // no helper is called for these two values: pDst is left untouched
    5944             :         case GDT_TypeCount:
    5945           0 :             break;
    5946             :     }
    5947             :         // clang-format on
    5948             : 
    5949             : #undef CALL_GDALTranspose2D_internal
    5950             : #undef CALL_GDALTranspose2DComplex_internal
    5951         280 : }
    5952             : 
    5953             : /************************************************************************/
    5954             : /*                      GDALInterleave2Byte()                           */
    5955             : /************************************************************************/
    5956             : 
    5957             : #if defined(HAVE_SSE2) &&                                                      \
    5958             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    5959             : 
    5960             : // ICC autovectorizer doesn't do a good job at generating good SSE code,
    5961             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
    5962             : #if defined(__GNUC__)
    5963             : __attribute__((noinline))
    5964             : #endif
    5965             : static void
    5966             : GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
    5967             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters)
    5968             : {
    5969             :     size_t i = 0;
    5970             :     constexpr size_t VALS_PER_ITER = 16;
    5971             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
    5972             :     {
    5973             :         __m128i xmm0 =
    5974             :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
    5975             :         __m128i xmm1 = _mm_loadu_si128(
    5976             :             reinterpret_cast<__m128i const *>(pSrc + i + nIters));
    5977             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
    5978             :                          _mm_unpacklo_epi8(xmm0, xmm1));
    5979             :         _mm_storeu_si128(
    5980             :             reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
    5981             :             _mm_unpackhi_epi8(xmm0, xmm1));
    5982             :     }
    5983             : #if defined(__clang__)
    5984             : #pragma clang loop vectorize(disable)
    5985             : #endif
    5986             :     for (; i < nIters; ++i)
    5987             :     {
    5988             :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];
    5989             :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];
    5990             :     }
    5991             : }
    5992             : 
    5993             : #else
    5994             : 
    5995             : #if defined(__GNUC__) && !defined(__clang__)
    5996             : __attribute__((optimize("tree-vectorize")))
    5997             : #endif
    5998             : #if defined(__GNUC__)
    5999             : __attribute__((noinline))
    6000             : #endif
    6001             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6002             : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
    6003             : #pragma clang diagnostic push
    6004             : #pragma clang diagnostic ignored "-Wpass-failed"
    6005             : #endif
    6006             : static void
    6007           4 : GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
    6008             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters)
    6009             : {
    6010             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6011             : #pragma clang loop vectorize(enable)
    6012             : #endif
    6013          44 :     for (size_t i = 0; i < nIters; ++i)
    6014             :     {
    6015          40 :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];
    6016          40 :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];
    6017             :     }
    6018           4 : }
    6019             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6020             : #pragma clang diagnostic pop
    6021             : #endif
    6022             : 
    6023             : #endif
    6024             : 
    6025             : /************************************************************************/
    6026             : /*                      GDALInterleave4Byte()                           */
    6027             : /************************************************************************/
    6028             : 
    6029             : #if defined(HAVE_SSE2) &&                                                      \
    6030             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    6031             : 
    6032             : // ICC autovectorizer doesn't do a good job at generating good SSE code,
    6033             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
    6034             : #if defined(__GNUC__)
    6035             : __attribute__((noinline))
    6036             : #endif
    6037             : static void
    6038             : GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
    6039             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters)
    6040             : {
    6041             :     size_t i = 0;
    6042             :     constexpr size_t VALS_PER_ITER = 16;
    6043             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
    6044             :     {
    6045             :         __m128i xmm0 = _mm_loadu_si128(
    6046             :             reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
    6047             :         __m128i xmm1 = _mm_loadu_si128(
    6048             :             reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
    6049             :         __m128i xmm2 = _mm_loadu_si128(
    6050             :             reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
    6051             :         __m128i xmm3 = _mm_loadu_si128(
    6052             :             reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
    6053             :         auto tmp0 = _mm_unpacklo_epi8(
    6054             :             xmm0,
    6055             :             xmm1);  // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
    6056             :         auto tmp1 = _mm_unpackhi_epi8(
    6057             :             xmm0,
    6058             :             xmm1);  // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
    6059             :         auto tmp2 = _mm_unpacklo_epi8(
    6060             :             xmm2,
    6061             :             xmm3);  // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
    6062             :         auto tmp3 = _mm_unpackhi_epi8(
    6063             :             xmm2,
    6064             :             xmm3);  // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
    6065             :         auto tmp2_0 = _mm_unpacklo_epi16(
    6066             :             tmp0,
    6067             :             tmp2);  // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
    6068             :         auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);
    6069             :         auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);
    6070             :         auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);
    6071             :         _mm_storeu_si128(
    6072             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
    6073             :             tmp2_0);
    6074             :         _mm_storeu_si128(
    6075             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
    6076             :             tmp2_1);
    6077             :         _mm_storeu_si128(
    6078             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
    6079             :             tmp2_2);
    6080             :         _mm_storeu_si128(
    6081             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
    6082             :             tmp2_3);
    6083             :     }
    6084             : #if defined(__clang__)
    6085             : #pragma clang loop vectorize(disable)
    6086             : #endif
    6087             :     for (; i < nIters; ++i)
    6088             :     {
    6089             :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];
    6090             :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];
    6091             :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];
    6092             :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];
    6093             :     }
    6094             : }
    6095             : 
    6096             : #else
    6097             : 
    6098             : #if defined(__GNUC__) && !defined(__clang__)
    6099             : __attribute__((optimize("tree-vectorize")))
    6100             : #endif
    6101             : #if defined(__GNUC__)
    6102             : __attribute__((noinline))
    6103             : #endif
    6104             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6105             : // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
    6106             : #pragma clang diagnostic push
    6107             : #pragma clang diagnostic ignored "-Wpass-failed"
    6108             : #endif
    6109             : static void
    6110           2 : GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
    6111             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters)
    6112             : {
    6113             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6114             : #pragma clang loop vectorize(enable)
    6115             : #endif
    6116          36 :     for (size_t i = 0; i < nIters; ++i)
    6117             :     {
    6118          34 :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];
    6119          34 :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];
    6120          34 :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];
    6121          34 :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];
    6122             :     }
    6123           2 : }
    6124             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6125             : #pragma clang diagnostic pop
    6126             : #endif
    6127             : 
    6128             : #endif
    6129             : 
    6130             : /************************************************************************/
    6131             : /*                        GDALTranspose2D()                             */
    6132             : /************************************************************************/
    6133             : 
    6134             : /**
    6135             :  * Transpose a 2D array in an efficient (cache-oblivious) way.
    6136             :  *
    6137             :  * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
    6138             :  * @param eSrcType Data type of pSrc.
    6139             :  * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
    6140             :  * @param eDstType Data type of pDst.
    6141             :  * @param nSrcWidth Width of pSrc array.
    6142             :  * @param nSrcHeight Height of pSrc array.
    6143             :  * @since GDAL 3.11
    6144             :  */
    6145             : 
    6146         305 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
    6147             :                      GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
    6148             : {
    6149         305 :     if (eSrcType == eDstType && (eSrcType == GDT_Byte || eSrcType == GDT_Int8))
    6150             :     {
    6151          25 :         if (nSrcHeight == 2)
    6152             :         {
    6153           4 :             GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
    6154             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    6155           4 :             return;
    6156             :         }
    6157          21 :         if (nSrcHeight == 4)
    6158             :         {
    6159           2 :             GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
    6160             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    6161           2 :             return;
    6162             :         }
    6163             : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                    \
    6164             :      (defined(__x86_64) || defined(_M_X64)))
    6165          19 :         if (CPLHaveRuntimeSSSE3())
    6166             :         {
    6167          19 :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    6168             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    6169             :                                        nSrcHeight);
    6170          19 :             return;
    6171             :         }
    6172             : #elif defined(USE_NEON_OPTIMIZATIONS)
    6173             :         {
    6174             :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    6175             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    6176             :                                        nSrcHeight);
    6177             :             return;
    6178             :         }
    6179             : #endif
    6180             :     }
    6181             : 
    6182             : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX)                \
    6183             :     GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>(                                 \
    6184             :         pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
    6185             : 
    6186             :     // clang-format off
    6187         280 :     switch (eDstType)
    6188             :     {
    6189          15 :         case GDT_Byte:     CALL_GDALTranspose2D_internal(uint8_t, false); break;
    6190          15 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t, false); break;
    6191          24 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t, false); break;
    6192          16 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t, false); break;
    6193          24 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t, false); break;
    6194          16 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t, false); break;
    6195          16 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t, false); break;
    6196          16 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t, false); break;
    6197          16 :         case GDT_Float16:  CALL_GDALTranspose2D_internal(GFloat16, false); break;
    6198          17 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float, false); break;
    6199          25 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double, false); break;
    6200          16 :         case GDT_CInt16:   CALL_GDALTranspose2D_internal(int16_t, true); break;
    6201          16 :         case GDT_CInt32:   CALL_GDALTranspose2D_internal(int32_t, true); break;
    6202          16 :         case GDT_CFloat16: CALL_GDALTranspose2D_internal(GFloat16, true); break;
    6203          16 :         case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
    6204          16 :         case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
    6205           0 :         case GDT_Unknown:
    6206             :         case GDT_TypeCount:
    6207           0 :             break;
    6208             :     }
    6209             :         // clang-format on
    6210             : 
    6211             : #undef CALL_GDALTranspose2D_internal
    6212             : }
    6213             : 
    6214             : /************************************************************************/
    6215             : /*                     ExtractBitAndConvertTo255()                      */
    6216             : /************************************************************************/
    6217             : 
    6218             : #if defined(__GNUC__) || defined(_MSC_VER)
    6219             : // Signedness of char is implementation-dependent, so be explicit.
    6220             : // Assumes two's-complement integer types and sign extension on right shift.
    6221             : // GCC guarantees such:
    6222             : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
    6223      157290 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
    6224             : {
    6225      157290 :     return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
    6226      157290 :                               7);
    6227             : }
    6228             : #else
    6229             : // Portable way
    6230             : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
    6231             : {
    6232             :     return (byVal & (1 << nBit)) ? 255 : 0;
    6233             : }
    6234             : #endif
    6235             : 
    6236             : /************************************************************************/
    6237             : /*                   ExpandEightPackedBitsToByteAt255()                 */
    6238             : /************************************************************************/
    6239             : 
    6240       19457 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,
    6241             :                                                     GByte abyOutput[8])
    6242             : {
    6243       19457 :     abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);
    6244       19457 :     abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
    6245       19457 :     abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
    6246       19457 :     abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
    6247       19457 :     abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
    6248       19457 :     abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
    6249       19457 :     abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
    6250       19457 :     abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);
    6251       19457 : }
    6252             : 
    6253             : /************************************************************************/
    6254             : /*                GDALExpandPackedBitsToByteAt0Or255()                  */
    6255             : /************************************************************************/
    6256             : 
    6257             : /** Expand packed bits (ordered from most-significant to least-significant bit)
    6258             :   into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
    6259             :   at 1 to a byte at 255.
    6260             : 
    6261             :  The function does (in a possibly more optimized way) the following:
    6262             :  \code{.cpp}
    6263             :  for (size_t i = 0; i < nInputBits; ++i )
    6264             :  {
    6265             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
    6266             :  }
    6267             :  \endcode
    6268             : 
    6269             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    6270             :  @param pabyOutput Output array of nInputBits bytes.
    6271             :  @param nInputBits Number of valid bits in pabyInput.
    6272             : 
    6273             :  @since 3.11
    6274             : */
    6275             : 
    6276       44445 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
    6277             :                                         GByte *CPL_RESTRICT pabyOutput,
    6278             :                                         size_t nInputBits)
    6279             : {
    6280       44445 :     const size_t nInputWholeBytes = nInputBits / 8;
    6281       44445 :     size_t iByte = 0;
    6282             : 
    6283             : #ifdef HAVE_SSE2
    6284             :     // Mask to isolate each bit
    6285       44445 :     const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
    6286             :                                           8, 16, 32, 64, -128);
    6287       44445 :     const __m128i zero = _mm_setzero_si128();
    6288       44445 :     const __m128i all_ones = _mm_set1_epi8(-1);
    6289             : #ifdef __SSSE3__
    6290             :     const __m128i dispatch_two_bytes =
    6291             :         _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
    6292             : #endif
    6293       44445 :     constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
    6294      132394 :     for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)
    6295             :     {
    6296       87949 :         __m128i reg_ori = _mm_loadu_si128(
    6297       87949 :             reinterpret_cast<const __m128i *>(pabyInput + iByte));
    6298             : 
    6299       87949 :         constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
    6300      791541 :         for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)
    6301             :         {
    6302             :             // Given reg_ori = (A, B, ... 14 other bytes ...),
    6303             :             // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
    6304             : #ifdef __SSSE3__
    6305             :             __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
    6306             : #else
    6307      703592 :             __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
    6308      703592 :             reg = _mm_unpacklo_epi16(reg, reg);
    6309      703592 :             reg = _mm_unpacklo_epi32(reg, reg);
    6310             : #endif
    6311             : 
    6312             :             // Test if bits of interest are set
    6313      703592 :             reg = _mm_and_si128(reg, bit_mask);
    6314             : 
    6315             :             // Now test if those bits are set, by comparing to zero. So the
    6316             :             // result will be that bytes where bits are set will be at 0, and
    6317             :             // ones where they are cleared will be at 0xFF. So the inverse of
    6318             :             // the end result we want!
    6319      703592 :             reg = _mm_cmpeq_epi8(reg, zero);
    6320             : 
    6321             :             // Invert the result
    6322      703592 :             reg = _mm_andnot_si128(reg, all_ones);
    6323             : 
    6324             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);
    6325             : 
    6326      703592 :             pabyOutput += SSE_REG_SIZE;
    6327             : 
    6328             :             // Right-shift of 2 bytes
    6329      703592 :             reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
    6330             :         }
    6331             :     }
    6332             : 
    6333             : #endif  // HAVE_SSE2
    6334             : 
    6335       63902 :     for (; iByte < nInputWholeBytes; ++iByte)
    6336             :     {
    6337       19457 :         ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
    6338       19457 :         pabyOutput += 8;
    6339             :     }
    6340       46079 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
    6341             :     {
    6342        1634 :         *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
    6343        1634 :         ++pabyOutput;
    6344             :     }
    6345       44445 : }
    6346             : 
    6347             : /************************************************************************/
    6348             : /*                   ExpandEightPackedBitsToByteAt1()                   */
    6349             : /************************************************************************/
    6350             : 
    6351      136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,
    6352             :                                                   GByte abyOutput[8])
    6353             : {
    6354      136113 :     abyOutput[0] = (byVal >> 7) & 0x1;
    6355      136113 :     abyOutput[1] = (byVal >> 6) & 0x1;
    6356      136113 :     abyOutput[2] = (byVal >> 5) & 0x1;
    6357      136113 :     abyOutput[3] = (byVal >> 4) & 0x1;
    6358      136113 :     abyOutput[4] = (byVal >> 3) & 0x1;
    6359      136113 :     abyOutput[5] = (byVal >> 2) & 0x1;
    6360      136113 :     abyOutput[6] = (byVal >> 1) & 0x1;
    6361      136113 :     abyOutput[7] = (byVal >> 0) & 0x1;
    6362      136113 : }
    6363             : 
    6364             : /************************************************************************/
    6365             : /*                GDALExpandPackedBitsToByteAt0Or1()                    */
    6366             : /************************************************************************/
    6367             : 
    6368             : /** Expand packed bits (ordered from most-significant to least-significant bit)
    6369             :   into a byte each, where a bit at 0 is expanded to a byte at 0, and a bit
    6370             :   at 1 to a byte at 1.
    6371             : 
    6372             :  The function does (in a possibly more optimized way) the following:
    6373             :  \code{.cpp}
    6374             :  for (size_t i = 0; i < nInputBits; ++i )
    6375             :  {
    6376             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
    6377             :  }
    6378             :  \endcode
    6379             : 
    6380             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    6381             :  @param pabyOutput Output array of nInputBits bytes.
    6382             :  @param nInputBits Number of valid bits in pabyInput.
    6383             : 
    6384             :  @since 3.11
    6385             : */
    6386             : 
    6387        7041 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
    6388             :                                       GByte *CPL_RESTRICT pabyOutput,
    6389             :                                       size_t nInputBits)
    6390             : {
    6391        7041 :     const size_t nInputWholeBytes = nInputBits / 8;
    6392        7041 :     size_t iByte = 0;
    6393      143154 :     for (; iByte < nInputWholeBytes; ++iByte)
    6394             :     {
    6395      136113 :         ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
    6396      136113 :         pabyOutput += 8;
    6397             :     }
    6398       18902 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)
    6399             :     {
    6400       11861 :         *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
    6401       11861 :         ++pabyOutput;
    6402             :     }
    6403        7041 : }

Generated by: LCOV version 1.14