LCOV - code coverage report
Current view: top level - gcore - rasterio.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2214 2436 90.9 %
Date: 2024-11-21 22:18:42 Functions: 339 371 91.4 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  Contains default implementation of GDALRasterBand::IRasterIO()
       5             :  *           and supporting functions of broader utility.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 1998, Frank Warmerdam
      10             :  * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal.h"
      17             : #include "gdal_priv.h"
      18             : 
      19             : #include <cassert>
      20             : #include <climits>
      21             : #include <cmath>
      22             : #include <cstddef>
      23             : #include <cstdio>
      24             : #include <cstdlib>
      25             : #include <cstring>
      26             : 
      27             : #include <algorithm>
      28             : #include <limits>
      29             : #include <stdexcept>
      30             : #include <type_traits>
      31             : 
      32             : #include "cpl_conv.h"
      33             : #include "cpl_cpu_features.h"
      34             : #include "cpl_error.h"
      35             : #include "cpl_progress.h"
      36             : #include "cpl_string.h"
      37             : #include "cpl_vsi.h"
      38             : #include "gdal_priv_templates.hpp"
      39             : #include "gdal_vrt.h"
      40             : #include "gdalwarper.h"
      41             : #include "memdataset.h"
      42             : #include "vrtdataset.h"
      43             : 
      44             : #if defined(__x86_64) || defined(_M_X64)
      45             : #include <emmintrin.h>
      46             : #define HAVE_SSE2
      47             : #elif defined(USE_NEON_OPTIMIZATIONS)
      48             : #include "include_sse2neon.h"
      49             : #define HAVE_SSE2
      50             : #endif
      51             : 
      52             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
      53             : #include "rasterio_ssse3.h"
      54             : #endif
      55             : 
      56             : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
      57             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
      58             :                              int nDstPixelStride, GPtrDiff_t nWordCount);
      59             : 
      60             : /************************************************************************/
      61             : /*                    DownsamplingIntegerXFactor()                      */
      62             : /************************************************************************/
      63             : 
      64             : template <bool bSameDataType, int DATA_TYPE_SIZE>
      65      413252 : static bool DownsamplingIntegerXFactor(
      66             :     GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
      67             :     GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
      68             :     GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
      69             :     int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
      70             : {
      71      413252 :     const int nBandDataSize =
      72             :         bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
      73      413252 :     int nOuterLoopIters = nBufXSize - 1;
      74      413252 :     const int nIncSrcOffset = nSrcXInc * nBandDataSize;
      75             :     const GByte *CPL_RESTRICT pabySrcData;
      76      413252 :     int nEndBlockX = nBlockXSize + nStartBlockX;
      77             : 
      78      413252 :     if (iSrcX < nEndBlockX)
      79             :     {
      80      226152 :         CPLAssert(poBlock);
      81      226152 :         goto no_reload_block;
      82             :     }
      83      187100 :     goto reload_block;
      84             : 
      85             :     // Don't do the last iteration in the loop, as iSrcX might go beyond
      86             :     // nRasterXSize - 1
      87      932888 :     while (--nOuterLoopIters >= 1)
      88             :     {
      89      189034 :         iSrcX += nSrcXInc;
      90      189034 :         pabySrcData += nIncSrcOffset;
      91      189034 :         pabyDstData += nPixelSpace;
      92             : 
      93             :         /* --------------------------------------------------------------------
      94             :          */
      95             :         /*      Ensure we have the appropriate block loaded. */
      96             :         /* --------------------------------------------------------------------
      97             :          */
      98      189034 :         if (iSrcX >= nEndBlockX)
      99             :         {
     100      189034 :         reload_block:
     101             :         {
     102      388724 :             const int nLBlockX = iSrcX / nBlockXSize;
     103      388724 :             nStartBlockX = nLBlockX * nBlockXSize;
     104      388724 :             nEndBlockX = nStartBlockX + nBlockXSize;
     105             : 
     106      388724 :             if (poBlock != nullptr)
     107      316739 :                 poBlock->DropLock();
     108             : 
     109      388724 :             poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     110      388724 :             if (poBlock == nullptr)
     111             :             {
     112           1 :                 return false;
     113             :             }
     114             :         }
     115             : 
     116      388723 :         no_reload_block:
     117             :             const GByte *pabySrcBlock =
     118      932888 :                 static_cast<const GByte *>(poBlock->GetDataRef());
     119      932888 :             GPtrDiff_t iSrcOffset =
     120      932888 :                 (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
     121      932888 :             pabySrcData = pabySrcBlock + iSrcOffset;
     122             :         }
     123             : 
     124             :         /* --------------------------------------------------------------------
     125             :          */
     126             :         /*      Copy the maximum run of pixels. */
     127             :         /* --------------------------------------------------------------------
     128             :          */
     129             : 
     130      932888 :         const int nIters = std::min(
     131      932888 :             (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
     132             :         if (bSameDataType)
     133             :         {
     134      932483 :             memcpy(pabyDstData, pabySrcData, nBandDataSize);
     135      932483 :             if (nIters > 1)
     136             :             {
     137             :                 if (DATA_TYPE_SIZE == 1)
     138             :                 {
     139      276307 :                     pabySrcData += nIncSrcOffset;
     140      276307 :                     pabyDstData += nPixelSpace;
     141      276307 :                     GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
     142      276307 :                                      nPixelSpace, nIters - 1);
     143      276307 :                     pabySrcData +=
     144      276307 :                         static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
     145      276307 :                     pabyDstData +=
     146      276307 :                         static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
     147             :                 }
     148             :                 else
     149             :                 {
     150     4443828 :                     for (int i = 0; i < nIters - 1; i++)
     151             :                     {
     152     4245254 :                         pabySrcData += nIncSrcOffset;
     153     4245254 :                         pabyDstData += nPixelSpace;
     154     4245254 :                         memcpy(pabyDstData, pabySrcData, nBandDataSize);
     155             :                     }
     156             :                 }
     157      474881 :                 iSrcX += nSrcXInc * (nIters - 1);
     158      474881 :                 nOuterLoopIters -= nIters - 1;
     159             :             }
     160             :         }
     161             :         else
     162             :         {
     163             :             // Type to type conversion ...
     164         405 :             GDALCopyWords(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
     165         405 :                           eBufType, nPixelSpace, std::max(1, nIters));
     166         405 :             if (nIters > 1)
     167             :             {
     168         198 :                 pabySrcData +=
     169         198 :                     static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
     170         198 :                 pabyDstData +=
     171         198 :                     static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
     172         198 :                 iSrcX += nSrcXInc * (nIters - 1);
     173         198 :                 nOuterLoopIters -= nIters - 1;
     174             :             }
     175             :         }
     176             :     }
     177             : 
     178             :     // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
     179      743854 :     if (nOuterLoopIters == 0)
     180             :     {
     181      330603 :         const int nRasterXSize = poBand->GetXSize();
     182      330603 :         iSrcX =
     183      661206 :             static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
     184      330603 :                                       static_cast<GInt64>(nRasterXSize - 1)));
     185      330603 :         pabyDstData += nPixelSpace;
     186      330603 :         if (iSrcX < nEndBlockX)
     187             :         {
     188      318013 :             goto no_reload_block;
     189             :         }
     190       12590 :         goto reload_block;
     191             :     }
     192      413251 :     return true;
     193             : }
     194             : 
     195             : /************************************************************************/
     196             : /*                             IRasterIO()                              */
     197             : /*                                                                      */
     198             : /*      Default internal implementation of RasterIO() ... utilizes      */
     199             : /*      the Block access methods to satisfy the request.  This would    */
     200             : /*      normally only be overridden by formats with overviews.          */
     201             : /************************************************************************/
     202             : 
     203     5686630 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
     204             :                                  int nXSize, int nYSize, void *pData,
     205             :                                  int nBufXSize, int nBufYSize,
     206             :                                  GDALDataType eBufType, GSpacing nPixelSpace,
     207             :                                  GSpacing nLineSpace,
     208             :                                  GDALRasterIOExtraArg *psExtraArg)
     209             : 
     210             : {
     211     5686630 :     if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
     212             :     {
     213           0 :         CPLError(eFlushBlockErr, CPLE_AppDefined,
     214             :                  "An error occurred while writing a dirty block "
     215             :                  "from GDALRasterBand::IRasterIO");
     216           0 :         CPLErr eErr = eFlushBlockErr;
     217           0 :         eFlushBlockErr = CE_None;
     218           0 :         return eErr;
     219             :     }
     220     5686630 :     if (nBlockXSize <= 0 || nBlockYSize <= 0)
     221             :     {
     222        1987 :         CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
     223           0 :         return CE_Failure;
     224             :     }
     225             : 
     226     5684640 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
     227     5684640 :     const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
     228     5682700 :     GByte dummyBlock[2] = {0, 0};
     229     5682700 :     GByte *pabySrcBlock =
     230             :         dummyBlock; /* to avoid Coverity warning about nullptr dereference */
     231     5682700 :     GDALRasterBlock *poBlock = nullptr;
     232     5682700 :     const bool bUseIntegerRequestCoords =
     233     5721960 :         (!psExtraArg->bFloatingPointWindowValidity ||
     234       39263 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
     235       15903 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
     236             : 
     237             :     /* ==================================================================== */
     238             :     /*      A common case is the data requested with the destination        */
     239             :     /*      is packed, and the block width is the raster width.             */
     240             :     /* ==================================================================== */
     241     5604230 :     if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
     242     2926780 :         nBlockXSize == GetXSize() && nBufXSize == nXSize &&
     243    11286100 :         nBufYSize == nYSize && bUseIntegerRequestCoords)
     244             :     {
     245     2793070 :         CPLErr eErr = CE_None;
     246     2793070 :         int nLBlockY = -1;
     247             : 
     248     8425560 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     249             :         {
     250     5628170 :             const int iSrcY = iBufYOff + nYOff;
     251             : 
     252     5628170 :             if (iSrcY < nLBlockY * nBlockYSize ||
     253     5627410 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
     254             :             {
     255     3033080 :                 nLBlockY = iSrcY / nBlockYSize;
     256     3033080 :                 bool bJustInitialize =
     257       95893 :                     eRWFlag == GF_Write && nXOff == 0 &&
     258     3180410 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     259       51439 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
     260             : 
     261             :                 // Is this a partial tile at right and/or bottom edges of
     262             :                 // the raster, and that is going to be completely written?
     263             :                 // If so, do not load it from storage, but zero it so that
     264             :                 // the content outsize of the validity area is initialized.
     265     3033080 :                 bool bMemZeroBuffer = false;
     266       95893 :                 if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
     267       21147 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     268     3129060 :                     nYOff + nYSize == GetYSize() &&
     269          89 :                     nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
     270             :                 {
     271          89 :                     bJustInitialize = true;
     272          89 :                     bMemZeroBuffer = true;
     273             :                 }
     274             : 
     275     3033080 :                 if (poBlock)
     276      237292 :                     poBlock->DropLock();
     277             : 
     278     3033080 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     279     3029700 :                 poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
     280     3034860 :                 if (poBlock == nullptr)
     281             :                 {
     282        1066 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     283             :                         nullptr)
     284             :                     {
     285           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     286             :                                  "GetBlockRef failed at X block offset %d, "
     287             :                                  "Y block offset %d%s",
     288             :                                  0, nLBlockY,
     289           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     290           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     291             :                                      : "");
     292             :                     }
     293        1066 :                     eErr = CE_Failure;
     294        1066 :                     break;
     295             :                 }
     296             : 
     297     3033800 :                 if (eRWFlag == GF_Write)
     298       95893 :                     poBlock->MarkDirty();
     299             : 
     300     3033800 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     301     3033740 :                 if (bMemZeroBuffer)
     302             :                 {
     303          89 :                     memset(pabySrcBlock, 0,
     304          89 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     305          89 :                                nBlockXSize * nBlockYSize);
     306             :                 }
     307             :             }
     308             : 
     309     5628830 :             const auto nSrcByteOffset =
     310     5628830 :                 (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
     311     5628830 :                      nBlockXSize +
     312     5628830 :                  nXOff) *
     313     5628830 :                 nBandDataSize;
     314             : 
     315     5628830 :             if (eDataType == eBufType)
     316             :             {
     317     1731600 :                 if (eRWFlag == GF_Read)
     318     1489880 :                     memcpy(static_cast<GByte *>(pData) +
     319     1489880 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     320     1489880 :                            pabySrcBlock + nSrcByteOffset,
     321             :                            static_cast<size_t>(nLineSpace));
     322             :                 else
     323      241713 :                     memcpy(pabySrcBlock + nSrcByteOffset,
     324      241713 :                            static_cast<GByte *>(pData) +
     325      241713 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     326             :                            static_cast<size_t>(nLineSpace));
     327             :             }
     328             :             else
     329             :             {
     330             :                 // Type to type conversion.
     331             : 
     332     3897240 :                 if (eRWFlag == GF_Read)
     333     3882700 :                     GDALCopyWords(
     334     3882700 :                         pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
     335             :                         static_cast<GByte *>(pData) +
     336     3882700 :                             static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     337             :                         eBufType, static_cast<int>(nPixelSpace), nBufXSize);
     338             :                 else
     339       14531 :                     GDALCopyWords(static_cast<GByte *>(pData) +
     340       14531 :                                       static_cast<GPtrDiff_t>(iBufYOff) *
     341             :                                           nLineSpace,
     342             :                                   eBufType, static_cast<int>(nPixelSpace),
     343       14531 :                                   pabySrcBlock + nSrcByteOffset, eDataType,
     344             :                                   nBandDataSize, nBufXSize);
     345             :             }
     346             : 
     347     5691520 :             if (psExtraArg->pfnProgress != nullptr &&
     348       59019 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     349             :                                          psExtraArg->pProgressData))
     350             :             {
     351           5 :                 eErr = CE_Failure;
     352           5 :                 break;
     353             :             }
     354             :         }
     355             : 
     356     2798460 :         if (poBlock)
     357     2796260 :             poBlock->DropLock();
     358             : 
     359     2797290 :         return eErr;
     360             :     }
     361             : 
     362             :     /* ==================================================================== */
     363             :     /*      Do we have overviews that would be appropriate to satisfy       */
     364             :     /*      this request?                                                   */
     365             :     /* ==================================================================== */
     366     2888840 :     if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
     367             :         eRWFlag == GF_Read)
     368             :     {
     369             :         GDALRasterIOExtraArg sExtraArg;
     370        2832 :         GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
     371             : 
     372             :         const int nOverview =
     373        2832 :             GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
     374             :                                           nBufXSize, nBufYSize, &sExtraArg);
     375        2832 :         if (nOverview >= 0)
     376             :         {
     377        2812 :             GDALRasterBand *poOverviewBand = GetOverview(nOverview);
     378        2812 :             if (poOverviewBand == nullptr)
     379        2812 :                 return CE_Failure;
     380             : 
     381        2812 :             return poOverviewBand->RasterIO(
     382             :                 eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
     383        2812 :                 nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
     384             :         }
     385             :     }
     386             : 
     387      699328 :     if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
     388           0 :         nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
     389     3587280 :         nLineSpace == nPixelSpace * nBufXSize &&
     390           0 :         CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
     391             :     {
     392           0 :         memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
     393           0 :         return CE_None;
     394             :     }
     395             : 
     396             :     /* ==================================================================== */
     397             :     /*      The second case when we don't need subsample data but likely    */
     398             :     /*      need data type conversion.                                      */
     399             :     /* ==================================================================== */
     400     2887950 :     if (  // nPixelSpace == nBufDataSize &&
     401     2887950 :         nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
     402             :     {
     403             : #if DEBUG_VERBOSE
     404             :         printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
     405             :                nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
     406             : #endif
     407             : 
     408             :         /* --------------------------------------------------------------------
     409             :          */
     410             :         /*      Loop over buffer computing source locations. */
     411             :         /* --------------------------------------------------------------------
     412             :          */
     413             :         // Calculate starting values out of loop
     414     2525080 :         const int nLBlockXStart = nXOff / nBlockXSize;
     415     2525080 :         const int nXSpanEnd = nBufXSize + nXOff;
     416             : 
     417     2525080 :         int nYInc = 0;
     418     5084770 :         for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
     419     2559690 :              iBufYOff += nYInc, iSrcY += nYInc)
     420             :         {
     421     2559640 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     422             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     423     2559640 :             int nLBlockY = iSrcY / nBlockYSize;
     424     2559640 :             int nLBlockX = nLBlockXStart;
     425     2559640 :             int iSrcX = nXOff;
     426     5330610 :             while (iSrcX < nXSpanEnd)
     427             :             {
     428     2770920 :                 int nXSpan = nLBlockX * nBlockXSize;
     429     2770920 :                 if (nXSpan < INT_MAX - nBlockXSize)
     430     2770920 :                     nXSpan += nBlockXSize;
     431             :                 else
     432           2 :                     nXSpan = INT_MAX;
     433     2770920 :                 const int nXRight = nXSpan;
     434     2770920 :                 nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
     435     2770920 :                 const size_t nXSpanSize =
     436     2770920 :                     nXSpan * static_cast<size_t>(nPixelSpace);
     437             : 
     438     2770920 :                 bool bJustInitialize =
     439     2042040 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
     440       37152 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
     441     4838540 :                     nXOff <= nLBlockX * nBlockXSize &&
     442       25575 :                     nXOff + nXSize >= nXRight;
     443             : 
     444             :                 // Is this a partial tile at right and/or bottom edges of
     445             :                 // the raster, and that is going to be completely written?
     446             :                 // If so, do not load it from storage, but zero it so that
     447             :                 // the content outsize of the validity area is initialized.
     448     2770920 :                 bool bMemZeroBuffer = false;
     449     2042040 :                 if (eRWFlag == GF_Write && !bJustInitialize &&
     450     2017720 :                     nXOff <= nLBlockX * nBlockXSize &&
     451     2016100 :                     nYOff <= nLBlockY * nBlockYSize &&
     452       12074 :                     (nXOff + nXSize >= nXRight ||
     453             :                      // cppcheck-suppress knownConditionTrueFalse
     454     4815620 :                      (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
     455       11896 :                     (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
     456       10657 :                      (nYOff + nYSize == GetYSize() &&
     457        1870 :                       nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
     458             :                 {
     459        3109 :                     bJustInitialize = true;
     460        3109 :                     bMemZeroBuffer = true;
     461             :                 }
     462             : 
     463             :                 /* --------------------------------------------------------------------
     464             :                  */
     465             :                 /*      Ensure we have the appropriate block loaded. */
     466             :                 /* --------------------------------------------------------------------
     467             :                  */
     468     2770920 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     469     2771080 :                 poBlock =
     470     2770840 :                     GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     471     2771080 :                 if (!poBlock)
     472             :                 {
     473          71 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     474             :                         nullptr)
     475             :                     {
     476           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     477             :                                  "GetBlockRef failed at X block offset %d, "
     478             :                                  "Y block offset %d%s",
     479             :                                  nLBlockX, nLBlockY,
     480           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     481           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     482             :                                      : "");
     483             :                     }
     484          71 :                     return (CE_Failure);
     485             :                 }
     486             : 
     487     2771000 :                 if (eRWFlag == GF_Write)
     488     2042040 :                     poBlock->MarkDirty();
     489             : 
     490     2771000 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     491     2771000 :                 if (bMemZeroBuffer)
     492             :                 {
     493        3109 :                     memset(pabySrcBlock, 0,
     494        3109 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     495        3109 :                                nBlockXSize * nBlockYSize);
     496             :                 }
     497             :                 /* --------------------------------------------------------------------
     498             :                  */
     499             :                 /*      Copy over this chunk of data. */
     500             :                 /* --------------------------------------------------------------------
     501             :                  */
     502     2771000 :                 GPtrDiff_t iSrcOffset =
     503     2771000 :                     (static_cast<GPtrDiff_t>(iSrcX) -
     504     2771000 :                      static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
     505     2771000 :                      (static_cast<GPtrDiff_t>(iSrcY) -
     506     2771000 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     507     2771000 :                          nBlockXSize) *
     508     2771000 :                     nBandDataSize;
     509             :                 // Fill up as many rows as possible for the loaded block.
     510     5541990 :                 const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
     511     2771000 :                                           nBufYSize - iBufYOff);
     512    58531700 :                 for (int k = 0; k < kmax; k++)
     513             :                 {
     514    55760900 :                     if (eDataType == eBufType && nPixelSpace == nBufDataSize)
     515             :                     {
     516    51837100 :                         if (eRWFlag == GF_Read)
     517    47412500 :                             memcpy(static_cast<GByte *>(pData) + iBufOffset +
     518    47412500 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     519    47412500 :                                    pabySrcBlock + iSrcOffset, nXSpanSize);
     520             :                         else
     521     4424610 :                             memcpy(pabySrcBlock + iSrcOffset,
     522     4424610 :                                    static_cast<GByte *>(pData) + iBufOffset +
     523     4424610 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     524             :                                    nXSpanSize);
     525             :                     }
     526             :                     else
     527             :                     {
     528             :                         /* type to type conversion */
     529     3923790 :                         if (eRWFlag == GF_Read)
     530     3899510 :                             GDALCopyWords(
     531     3899510 :                                 pabySrcBlock + iSrcOffset, eDataType,
     532             :                                 nBandDataSize,
     533     3899510 :                                 static_cast<GByte *>(pData) + iBufOffset +
     534     3899510 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     535             :                                 eBufType, static_cast<int>(nPixelSpace),
     536             :                                 nXSpan);
     537             :                         else
     538       24282 :                             GDALCopyWords(
     539       24282 :                                 static_cast<GByte *>(pData) + iBufOffset +
     540       24282 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     541             :                                 eBufType, static_cast<int>(nPixelSpace),
     542       24282 :                                 pabySrcBlock + iSrcOffset, eDataType,
     543             :                                 nBandDataSize, nXSpan);
     544             :                     }
     545             : 
     546    55760700 :                     iSrcOffset +=
     547    55760700 :                         static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
     548             :                 }
     549             : 
     550             :                 iBufOffset =
     551     2770870 :                     CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
     552     2770890 :                 nLBlockX++;
     553     2770890 :                 iSrcX += nXSpan;
     554             : 
     555     2770890 :                 poBlock->DropLock();
     556     2770970 :                 poBlock = nullptr;
     557             :             }
     558             : 
     559             :             /* Compute the increment to go on a block boundary */
     560     2559690 :             nYInc = nBlockYSize - (iSrcY % nBlockYSize);
     561             : 
     562     2561450 :             if (psExtraArg->pfnProgress != nullptr &&
     563        1761 :                 !psExtraArg->pfnProgress(
     564     2561450 :                     1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
     565             :                     psExtraArg->pProgressData))
     566             :             {
     567           0 :                 return CE_Failure;
     568             :             }
     569             :         }
     570             : 
     571     2525130 :         return CE_None;
     572             :     }
     573             : 
     574             :     /* ==================================================================== */
     575             :     /*      Loop reading required source blocks to satisfy output           */
     576             :     /*      request.  This is the most general implementation.              */
     577             :     /* ==================================================================== */
     578             : 
     579      362870 :     double dfXOff = nXOff;
     580      362870 :     double dfYOff = nYOff;
     581      362870 :     double dfXSize = nXSize;
     582      362870 :     double dfYSize = nYSize;
     583      362870 :     if (psExtraArg->bFloatingPointWindowValidity)
     584             :     {
     585       28160 :         dfXOff = psExtraArg->dfXOff;
     586       28160 :         dfYOff = psExtraArg->dfYOff;
     587       28160 :         dfXSize = psExtraArg->dfXSize;
     588       28160 :         dfYSize = psExtraArg->dfYSize;
     589             :     }
     590             : 
     591             :     /* -------------------------------------------------------------------- */
     592             :     /*      Compute stepping increment.                                     */
     593             :     /* -------------------------------------------------------------------- */
     594      362870 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
     595      362870 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
     596      362870 :     CPLErr eErr = CE_None;
     597             : 
     598      362870 :     if (eRWFlag == GF_Write)
     599             :     {
     600             :         /* --------------------------------------------------------------------
     601             :          */
     602             :         /*    Write case */
     603             :         /*    Loop over raster window computing source locations in the buffer.
     604             :          */
     605             :         /* --------------------------------------------------------------------
     606             :          */
     607      166650 :         GByte *pabyDstBlock = nullptr;
     608      166650 :         int nLBlockX = -1;
     609      166650 :         int nLBlockY = -1;
     610             : 
     611     1259590 :         for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
     612             :         {
     613     1092940 :             const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
     614             : 
     615    12063600 :             for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
     616             :             {
     617    10970600 :                 const int iBufXOff =
     618    10970600 :                     static_cast<int>((iDstX - nXOff) / dfSrcXInc);
     619    10970600 :                 GPtrDiff_t iBufOffset =
     620    10970600 :                     static_cast<GPtrDiff_t>(iBufYOff) *
     621             :                         static_cast<GPtrDiff_t>(nLineSpace) +
     622    10970600 :                     iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
     623             : 
     624             :                 // FIXME: this code likely doesn't work if the dirty block gets
     625             :                 // flushed to disk before being completely written.
     626             :                 // In the meantime, bJustInitialize should probably be set to
     627             :                 // FALSE even if it is not ideal performance wise, and for
     628             :                 // lossy compression.
     629             : 
     630             :                 /* --------------------------------------------------------------------
     631             :                  */
     632             :                 /*      Ensure we have the appropriate block loaded. */
     633             :                 /* --------------------------------------------------------------------
     634             :                  */
     635    10970600 :                 if (iDstX < nLBlockX * nBlockXSize ||
     636    10721300 :                     iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
     637    10264600 :                     iDstY < nLBlockY * nBlockYSize ||
     638    10264600 :                     iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
     639             :                 {
     640      738642 :                     nLBlockX = iDstX / nBlockXSize;
     641      738642 :                     nLBlockY = iDstY / nBlockYSize;
     642             : 
     643      738642 :                     const bool bJustInitialize =
     644     1065870 :                         nYOff <= nLBlockY * nBlockYSize &&
     645      327231 :                         nYOff + nYSize - nBlockYSize >=
     646      327231 :                             nLBlockY * nBlockYSize &&
     647     1116140 :                         nXOff <= nLBlockX * nBlockXSize &&
     648       50265 :                         nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
     649             :                     /*bool bMemZeroBuffer = FALSE;
     650             :                     if( !bJustInitialize &&
     651             :                         nXOff <= nLBlockX * nBlockXSize &&
     652             :                         nYOff <= nLBlockY * nBlockYSize &&
     653             :                         (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
     654             :                          (nXOff + nXSize == GetXSize() &&
     655             :                          (nLBlockX+1) * nBlockXSize > GetXSize())) &&
     656             :                         (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
     657             :                          (nYOff + nYSize == GetYSize() &&
     658             :                          (nLBlockY+1) * nBlockYSize > GetYSize())) )
     659             :                     {
     660             :                         bJustInitialize = TRUE;
     661             :                         bMemZeroBuffer = TRUE;
     662             :                     }*/
     663      738642 :                     if (poBlock != nullptr)
     664      571992 :                         poBlock->DropLock();
     665             : 
     666      738642 :                     poBlock =
     667      738642 :                         GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     668      738642 :                     if (poBlock == nullptr)
     669             :                     {
     670           0 :                         return (CE_Failure);
     671             :                     }
     672             : 
     673      738642 :                     poBlock->MarkDirty();
     674             : 
     675      738642 :                     pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
     676             :                     /*if( bMemZeroBuffer )
     677             :                     {
     678             :                         memset(pabyDstBlock, 0,
     679             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
     680             :                     * nBlockYSize);
     681             :                     }*/
     682             :                 }
     683             : 
     684             :                 // To make Coverity happy. Should not happen by design.
     685    10970600 :                 if (pabyDstBlock == nullptr)
     686             :                 {
     687           0 :                     CPLAssert(false);
     688             :                     eErr = CE_Failure;
     689             :                     break;
     690             :                 }
     691             : 
     692             :                 /* --------------------------------------------------------------------
     693             :                  */
     694             :                 /*      Copy over this pixel of data. */
     695             :                 /* --------------------------------------------------------------------
     696             :                  */
     697    10970600 :                 GPtrDiff_t iDstOffset =
     698    10970600 :                     (static_cast<GPtrDiff_t>(iDstX) -
     699    10970600 :                      static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
     700    10970600 :                      (static_cast<GPtrDiff_t>(iDstY) -
     701    10970600 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     702    10970600 :                          nBlockXSize) *
     703    10970600 :                     nBandDataSize;
     704             : 
     705    10970600 :                 if (eDataType == eBufType)
     706             :                 {
     707    10967500 :                     memcpy(pabyDstBlock + iDstOffset,
     708    10967500 :                            static_cast<GByte *>(pData) + iBufOffset,
     709             :                            nBandDataSize);
     710             :                 }
     711             :                 else
     712             :                 {
     713             :                     /* type to type conversion ... ouch, this is expensive way
     714             :                     of handling single words */
     715             : 
     716        3096 :                     GDALCopyWords(static_cast<GByte *>(pData) + iBufOffset,
     717        3096 :                                   eBufType, 0, pabyDstBlock + iDstOffset,
     718             :                                   eDataType, 0, 1);
     719             :                 }
     720             :             }
     721             : 
     722     1092940 :             if (psExtraArg->pfnProgress != nullptr &&
     723           0 :                 !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
     724             :                                          psExtraArg->pProgressData))
     725             :             {
     726           0 :                 eErr = CE_Failure;
     727           0 :                 break;
     728             :             }
     729             :         }
     730             :     }
     731             :     else
     732             :     {
     733      196220 :         if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
     734             :         {
     735        7612 :             if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
     736        2490 :                  psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
     737        2488 :                  psExtraArg->eResampleAlg == GRIORA_Bilinear ||
     738        5127 :                  psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
     739        2455 :                 GetColorTable() != nullptr)
     740             :             {
     741           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     742             :                          "Resampling method not supported on paletted band. "
     743             :                          "Falling back to nearest neighbour");
     744             :             }
     745        2564 :             else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
     746           3 :                      GDALDataTypeIsComplex(eDataType))
     747             :             {
     748           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     749             :                          "Resampling method not supported on complex data type "
     750             :                          "band. Falling back to nearest neighbour");
     751             :             }
     752             :             else
     753             :             {
     754        2561 :                 return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
     755             :                                          pData, nBufXSize, nBufYSize, eBufType,
     756        2561 :                                          nPixelSpace, nLineSpace, psExtraArg);
     757             :             }
     758             :         }
     759             : 
     760      193619 :         int nLimitBlockY = 0;
     761      193619 :         const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
     762      193619 :         int nStartBlockX = -nBlockXSize;
     763      193619 :         const double EPS = 1e-10;
     764      193619 :         int nLBlockY = -1;
     765      193619 :         const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
     766      193619 :         const bool bIntegerXFactor =
     767      170972 :             bUseIntegerRequestCoords &&
     768      265604 :             static_cast<int>(dfSrcXInc) == dfSrcXInc &&
     769       71985 :             static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
     770             : 
     771             :         /* --------------------------------------------------------------------
     772             :          */
     773             :         /*      Read case */
     774             :         /*      Loop over buffer computing source locations. */
     775             :         /* --------------------------------------------------------------------
     776             :          */
     777     1955240 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     778             :         {
     779             :             // Add small epsilon to avoid some numeric precision issues.
     780     1761630 :             const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
     781     1761630 :             const int iSrcY = static_cast<int>(std::min(
     782     1761630 :                 std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
     783             : 
     784     1761630 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     785             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     786             : 
     787     1761630 :             if (iSrcY >= nLimitBlockY)
     788             :             {
     789      234813 :                 nLBlockY = iSrcY / nBlockYSize;
     790      234813 :                 nLimitBlockY = nLBlockY * nBlockYSize;
     791      234813 :                 if (nLimitBlockY < INT_MAX - nBlockYSize)
     792      234813 :                     nLimitBlockY += nBlockYSize;
     793             :                 else
     794           0 :                     nLimitBlockY = INT_MAX;
     795             :                 // Make sure a new block is loaded.
     796      234813 :                 nStartBlockX = -nBlockXSize;
     797             :             }
     798     1526820 :             else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
     799             :             {
     800             :                 // Make sure a new block is loaded.
     801      429795 :                 nStartBlockX = -nBlockXSize;
     802             :             }
     803             : 
     804     1761630 :             GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
     805     1761630 :                                        static_cast<GPtrDiff_t>(nBlockXSize);
     806             : 
     807     1761630 :             if (bIntegerXFactor)
     808             :             {
     809      413252 :                 int iSrcX = static_cast<int>(dfSrcXStart);
     810      413252 :                 const int nSrcXInc = static_cast<int>(dfSrcXInc);
     811      413252 :                 GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
     812      413252 :                 bool bRet = false;
     813      413252 :                 if (bByteCopy)
     814             :                 {
     815      302865 :                     bRet = DownsamplingIntegerXFactor<true, 1>(
     816             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     817             :                         static_cast<int>(nPixelSpace), nBufXSize, GDT_Byte,
     818             :                         GDT_Byte, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     819             :                 }
     820      110387 :                 else if (eDataType == eBufType)
     821             :                 {
     822      110182 :                     switch (nBandDataSize)
     823             :                     {
     824      110102 :                         case 2:
     825      110102 :                             bRet = DownsamplingIntegerXFactor<true, 2>(
     826             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     827             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     828             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     829             :                                 nBlockXSize, poBlock, nLBlockY);
     830      110102 :                             break;
     831          22 :                         case 4:
     832          22 :                             bRet = DownsamplingIntegerXFactor<true, 4>(
     833             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     834             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     835             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     836             :                                 nBlockXSize, poBlock, nLBlockY);
     837          22 :                             break;
     838          56 :                         case 8:
     839          56 :                             bRet = DownsamplingIntegerXFactor<true, 8>(
     840             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     841             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     842             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     843             :                                 nBlockXSize, poBlock, nLBlockY);
     844          56 :                             break;
     845           2 :                         case 16:
     846           2 :                             bRet = DownsamplingIntegerXFactor<true, 16>(
     847             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     848             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     849             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     850             :                                 nBlockXSize, poBlock, nLBlockY);
     851           2 :                             break;
     852           0 :                         default:
     853           0 :                             CPLAssert(false);
     854             :                             break;
     855             :                     }
     856             :                 }
     857             :                 else
     858             :                 {
     859         205 :                     bRet = DownsamplingIntegerXFactor<false, 0>(
     860             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     861             :                         static_cast<int>(nPixelSpace), nBufXSize, eDataType,
     862             :                         eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     863             :                 }
     864      413252 :                 if (!bRet)
     865           1 :                     eErr = CE_Failure;
     866             :             }
     867             :             else
     868             :             {
     869     1348380 :                 double dfSrcX = dfSrcXStart;
     870   570400000 :                 for (int iBufXOff = 0; iBufXOff < nBufXSize;
     871   569052000 :                      iBufXOff++, dfSrcX += dfSrcXInc)
     872             :                 {
     873             :                     // TODO?: try to avoid the clamping for most iterations
     874             :                     const int iSrcX = static_cast<int>(
     875  1138100000 :                         std::min(std::max(0.0, dfSrcX),
     876   569052000 :                                  static_cast<double>(nRasterXSize - 1)));
     877             : 
     878             :                     /* --------------------------------------------------------------------
     879             :                      */
     880             :                     /*      Ensure we have the appropriate block loaded. */
     881             :                     /* --------------------------------------------------------------------
     882             :                      */
     883   569052000 :                     if (iSrcX >= nBlockXSize + nStartBlockX)
     884             :                     {
     885     1705380 :                         const int nLBlockX = iSrcX / nBlockXSize;
     886     1705380 :                         nStartBlockX = nLBlockX * nBlockXSize;
     887             : 
     888     1705380 :                         if (poBlock != nullptr)
     889     1583740 :                             poBlock->DropLock();
     890             : 
     891     1705380 :                         poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     892     1705380 :                         if (poBlock == nullptr)
     893             :                         {
     894           9 :                             eErr = CE_Failure;
     895           9 :                             break;
     896             :                         }
     897             : 
     898             :                         pabySrcBlock =
     899     1705370 :                             static_cast<GByte *>(poBlock->GetDataRef());
     900             :                     }
     901   569052000 :                     const GPtrDiff_t nDiffX =
     902   569052000 :                         static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
     903             : 
     904             :                     /* --------------------------------------------------------------------
     905             :                      */
     906             :                     /*      Copy over this pixel of data. */
     907             :                     /* --------------------------------------------------------------------
     908             :                      */
     909             : 
     910   569052000 :                     if (bByteCopy)
     911             :                     {
     912   515613000 :                         GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
     913   515613000 :                         static_cast<GByte *>(pData)[iBufOffset] =
     914   515613000 :                             pabySrcBlock[iSrcOffset];
     915             :                     }
     916    53439100 :                     else if (eDataType == eBufType)
     917             :                     {
     918    48225500 :                         GPtrDiff_t iSrcOffset =
     919    48225500 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     920    48225500 :                         memcpy(static_cast<GByte *>(pData) + iBufOffset,
     921    48225500 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
     922             :                     }
     923             :                     else
     924             :                     {
     925             :                         // Type to type conversion ...
     926     5213610 :                         GPtrDiff_t iSrcOffset =
     927     5213610 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     928     5213610 :                         GDALCopyWords(pabySrcBlock + iSrcOffset, eDataType, 0,
     929     5213610 :                                       static_cast<GByte *>(pData) + iBufOffset,
     930             :                                       eBufType, 0, 1);
     931             :                     }
     932             : 
     933   569052000 :                     iBufOffset += static_cast<int>(nPixelSpace);
     934             :                 }
     935             :             }
     936     1761630 :             if (eErr == CE_Failure)
     937          11 :                 break;
     938             : 
     939     1972790 :             if (psExtraArg->pfnProgress != nullptr &&
     940      211166 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     941             :                                          psExtraArg->pProgressData))
     942             :             {
     943           1 :                 eErr = CE_Failure;
     944           1 :                 break;
     945             :             }
     946             :         }
     947             :     }
     948             : 
     949      360269 :     if (poBlock != nullptr)
     950      360259 :         poBlock->DropLock();
     951             : 
     952      360269 :     return eErr;
     953             : }
     954             : 
     955             : /************************************************************************/
     956             : /*                         GDALRasterIOTransformer()                    */
     957             : /************************************************************************/
     958             : 
     959             : struct GDALRasterIOTransformerStruct
     960             : {
     961             :     double dfXOff;
     962             :     double dfYOff;
     963             :     double dfXRatioDstToSrc;
     964             :     double dfYRatioDstToSrc;
     965             : };
     966             : 
     967        6748 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
     968             :                                    int nPointCount, double *x, double *y,
     969             :                                    double * /* z */, int *panSuccess)
     970             : {
     971        6748 :     GDALRasterIOTransformerStruct *psParams =
     972             :         static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
     973        6748 :     if (bDstToSrc)
     974             :     {
     975      252996 :         for (int i = 0; i < nPointCount; i++)
     976             :         {
     977      246836 :             x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
     978      246836 :             y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
     979      246836 :             panSuccess[i] = TRUE;
     980             :         }
     981             :     }
     982             :     else
     983             :     {
     984        1176 :         for (int i = 0; i < nPointCount; i++)
     985             :         {
     986         588 :             x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;
     987         588 :             y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
     988         588 :             panSuccess[i] = TRUE;
     989             :         }
     990             :     }
     991        6748 :     return TRUE;
     992             : }
     993             : 
     994             : /************************************************************************/
     995             : /*                          RasterIOResampled()                         */
     996             : /************************************************************************/
     997             : 
     998             : //! @cond Doxygen_Suppress
     999        2561 : CPLErr GDALRasterBand::RasterIOResampled(
    1000             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1001             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1002             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    1003             : {
    1004             :     // Determine if we use warping resampling or overview resampling
    1005             :     const bool bUseWarp =
    1006        2561 :         (GDALDataTypeIsComplex(eDataType) &&
    1007        2718 :          psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    1008         157 :          psExtraArg->eResampleAlg != GRIORA_Mode);
    1009             : 
    1010        2561 :     double dfXOff = nXOff;
    1011        2561 :     double dfYOff = nYOff;
    1012        2561 :     double dfXSize = nXSize;
    1013        2561 :     double dfYSize = nYSize;
    1014        2561 :     if (psExtraArg->bFloatingPointWindowValidity)
    1015             :     {
    1016        2114 :         dfXOff = psExtraArg->dfXOff;
    1017        2114 :         dfYOff = psExtraArg->dfYOff;
    1018        2114 :         dfXSize = psExtraArg->dfXSize;
    1019        2114 :         dfYSize = psExtraArg->dfYSize;
    1020             :     }
    1021             : 
    1022        2561 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1023        2561 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1024             : 
    1025             :     // Determine the coordinates in the "virtual" output raster to see
    1026             :     // if there are not integers, in which case we will use them as a shift
    1027             :     // so that subwindow extracts give the exact same results as entire raster
    1028             :     // scaling.
    1029        2561 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1030        2561 :     bool bHasXOffVirtual = false;
    1031        2561 :     int nDestXOffVirtual = 0;
    1032        2561 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1033             :     {
    1034        2235 :         bHasXOffVirtual = true;
    1035        2235 :         dfXOff = nXOff;
    1036        2235 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1037             :     }
    1038             : 
    1039        2561 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1040        2561 :     bool bHasYOffVirtual = false;
    1041        2561 :     int nDestYOffVirtual = 0;
    1042        2561 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1043             :     {
    1044        2229 :         bHasYOffVirtual = true;
    1045        2229 :         dfYOff = nYOff;
    1046        2229 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1047             :     }
    1048             : 
    1049             :     // Create a MEM dataset that wraps the output buffer.
    1050             :     GDALDataset *poMEMDS;
    1051        2561 :     void *pTempBuffer = nullptr;
    1052        2561 :     GSpacing nPSMem = nPixelSpace;
    1053        2561 :     GSpacing nLSMem = nLineSpace;
    1054        2561 :     void *pDataMem = pData;
    1055        2561 :     GDALDataType eDTMem = eBufType;
    1056        2561 :     if (eBufType != eDataType)
    1057             :     {
    1058          40 :         nPSMem = GDALGetDataTypeSizeBytes(eDataType);
    1059          40 :         nLSMem = nPSMem * nBufXSize;
    1060             :         pTempBuffer =
    1061          40 :             VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
    1062          40 :         if (pTempBuffer == nullptr)
    1063           0 :             return CE_Failure;
    1064          40 :         pDataMem = pTempBuffer;
    1065          40 :         eDTMem = eDataType;
    1066             :     }
    1067             : 
    1068             :     poMEMDS =
    1069        2561 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1070             :                            nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
    1071        2561 :     GByte *pabyData = static_cast<GByte *>(pDataMem) -
    1072        2561 :                       nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
    1073        2561 :     GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
    1074             :         poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
    1075        2561 :     poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
    1076             : 
    1077        2561 :     const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1078        2561 :     const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    1079        2561 :     if (pszNBITS)
    1080           6 :         GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
    1081           6 :             "NBITS", pszNBITS, "IMAGE_STRUCTURE");
    1082             : 
    1083        2561 :     CPLErr eErr = CE_None;
    1084             : 
    1085             :     // Do the resampling.
    1086        2561 :     if (bUseWarp)
    1087             :     {
    1088         149 :         int bHasNoData = FALSE;
    1089         149 :         double dfNoDataValue = GetNoDataValue(&bHasNoData);
    1090             : 
    1091         149 :         VRTDatasetH hVRTDS = nullptr;
    1092         149 :         GDALRasterBandH hVRTBand = nullptr;
    1093         149 :         if (GetDataset() == nullptr)
    1094             :         {
    1095             :             /* Create VRT dataset that wraps the whole dataset */
    1096           0 :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1097           0 :             VRTAddBand(hVRTDS, eDataType, nullptr);
    1098           0 :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1099           0 :             VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
    1100             :                                0, 0, nRasterXSize, nRasterYSize, nullptr,
    1101             :                                VRT_NODATA_UNSET);
    1102             : 
    1103             :             /* Add a mask band if needed */
    1104           0 :             if (GetMaskFlags() != GMF_ALL_VALID)
    1105             :             {
    1106           0 :                 GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
    1107             :                 VRTSourcedRasterBand *poVRTMaskBand =
    1108             :                     reinterpret_cast<VRTSourcedRasterBand *>(
    1109             :                         reinterpret_cast<GDALRasterBand *>(hVRTBand)
    1110           0 :                             ->GetMaskBand());
    1111           0 :                 poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
    1112           0 :                                                  nRasterYSize, 0, 0,
    1113           0 :                                                  nRasterXSize, nRasterYSize);
    1114             :             }
    1115             :         }
    1116             : 
    1117         149 :         GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
    1118         149 :         switch (psExtraArg->eResampleAlg)
    1119             :         {
    1120           0 :             case GRIORA_NearestNeighbour:
    1121           0 :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1122           0 :                 break;
    1123         147 :             case GRIORA_Bilinear:
    1124         147 :                 psWarpOptions->eResampleAlg = GRA_Bilinear;
    1125         147 :                 break;
    1126           0 :             case GRIORA_Cubic:
    1127           0 :                 psWarpOptions->eResampleAlg = GRA_Cubic;
    1128           0 :                 break;
    1129           0 :             case GRIORA_CubicSpline:
    1130           0 :                 psWarpOptions->eResampleAlg = GRA_CubicSpline;
    1131           0 :                 break;
    1132           0 :             case GRIORA_Lanczos:
    1133           0 :                 psWarpOptions->eResampleAlg = GRA_Lanczos;
    1134           0 :                 break;
    1135           0 :             case GRIORA_Average:
    1136           0 :                 psWarpOptions->eResampleAlg = GRA_Average;
    1137           0 :                 break;
    1138           2 :             case GRIORA_RMS:
    1139           2 :                 psWarpOptions->eResampleAlg = GRA_RMS;
    1140           2 :                 break;
    1141           0 :             case GRIORA_Mode:
    1142           0 :                 psWarpOptions->eResampleAlg = GRA_Mode;
    1143           0 :                 break;
    1144           0 :             default:
    1145           0 :                 CPLAssert(false);
    1146             :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1147             :                 break;
    1148             :         }
    1149         149 :         psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
    1150         149 :         psWarpOptions->hDstDS = poMEMDS;
    1151         149 :         psWarpOptions->nBandCount = 1;
    1152         149 :         int nSrcBandNumber = hVRTDS ? 1 : nBand;
    1153         149 :         int nDstBandNumber = 1;
    1154         149 :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1155         149 :         psWarpOptions->panDstBands = &nDstBandNumber;
    1156         298 :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress
    1157         149 :                                          ? psExtraArg->pfnProgress
    1158             :                                          : GDALDummyProgress;
    1159         149 :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1160         149 :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1161         149 :         if (bHasNoData)
    1162             :         {
    1163           0 :             psWarpOptions->papszWarpOptions = CSLSetNameValue(
    1164             :                 psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
    1165           0 :             if (psWarpOptions->padfSrcNoDataReal == nullptr)
    1166             :             {
    1167           0 :                 psWarpOptions->padfSrcNoDataReal =
    1168           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1169           0 :                 psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
    1170             :             }
    1171             : 
    1172           0 :             if (psWarpOptions->padfDstNoDataReal == nullptr)
    1173             :             {
    1174           0 :                 psWarpOptions->padfDstNoDataReal =
    1175           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1176           0 :                 psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
    1177             :             }
    1178             :         }
    1179             : 
    1180             :         GDALRasterIOTransformerStruct sTransformer;
    1181         149 :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1182         149 :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1183         149 :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1184         149 :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1185         149 :         psWarpOptions->pTransformerArg = &sTransformer;
    1186             : 
    1187             :         GDALWarpOperationH hWarpOperation =
    1188         149 :             GDALCreateWarpOperation(psWarpOptions);
    1189         149 :         eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
    1190             :                                      nDestYOffVirtual, nBufXSize, nBufYSize);
    1191         149 :         GDALDestroyWarpOperation(hWarpOperation);
    1192             : 
    1193         149 :         psWarpOptions->panSrcBands = nullptr;
    1194         149 :         psWarpOptions->panDstBands = nullptr;
    1195         149 :         GDALDestroyWarpOptions(psWarpOptions);
    1196             : 
    1197         149 :         if (hVRTDS)
    1198           0 :             GDALClose(hVRTDS);
    1199             :     }
    1200             :     else
    1201             :     {
    1202        2412 :         const char *pszResampling =
    1203        2594 :             (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
    1204         293 :             : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
    1205         220 :             : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
    1206         213 :             : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
    1207         159 :             : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
    1208          95 :             : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
    1209          43 :             : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
    1210           3 :             : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
    1211             :                                                                : "UNKNOWN";
    1212             : 
    1213        2412 :         int nKernelRadius = 0;
    1214             :         GDALResampleFunction pfnResampleFunc =
    1215        2412 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1216        2412 :         CPLAssert(pfnResampleFunc);
    1217             :         GDALDataType eWrkDataType =
    1218        2412 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1219        2412 :         int nHasNoData = 0;
    1220        2412 :         double dfNoDataValue = GetNoDataValue(&nHasNoData);
    1221        2412 :         const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    1222        2412 :         if (!bHasNoData)
    1223        2348 :             dfNoDataValue = 0.0;
    1224             : 
    1225        2412 :         int nDstBlockXSize = nBufXSize;
    1226        2412 :         int nDstBlockYSize = nBufYSize;
    1227        2412 :         int nFullResXChunk = 0;
    1228        2412 :         int nFullResYChunk = 0;
    1229             :         while (true)
    1230             :         {
    1231        2412 :             nFullResXChunk =
    1232        2412 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1233        2412 :             nFullResYChunk =
    1234        2412 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1235        2412 :             if (nFullResXChunk > nRasterXSize)
    1236        2223 :                 nFullResXChunk = nRasterXSize;
    1237        2412 :             if (nFullResYChunk > nRasterYSize)
    1238         206 :                 nFullResYChunk = nRasterYSize;
    1239        2412 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1240        2366 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1241             :                  1024 * 1024))
    1242             :                 break;
    1243             :             // When operating on the full width of a raster whose block width is
    1244             :             // the raster width, prefer doing chunks in height.
    1245           0 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1246             :                 nDstBlockYSize > 1)
    1247           0 :                 nDstBlockYSize /= 2;
    1248             :             /* Otherwise cut the maximal dimension */
    1249           0 :             else if (nDstBlockXSize > 1 &&
    1250           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1251           0 :                 nDstBlockXSize /= 2;
    1252             :             else
    1253           0 :                 nDstBlockYSize /= 2;
    1254             :         }
    1255             : 
    1256        2412 :         int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
    1257        2412 :         int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    1258        2412 :         if (nOvrXFactor == 0)
    1259        2024 :             nOvrXFactor = 1;
    1260        2412 :         if (nOvrYFactor == 0)
    1261        2023 :             nOvrYFactor = 1;
    1262        2412 :         int nFullResXSizeQueried =
    1263        2412 :             nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
    1264        2412 :         int nFullResYSizeQueried =
    1265        2412 :             nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
    1266             : 
    1267        2412 :         if (nFullResXSizeQueried > nRasterXSize)
    1268        2125 :             nFullResXSizeQueried = nRasterXSize;
    1269        2412 :         if (nFullResYSizeQueried > nRasterYSize)
    1270         119 :             nFullResYSizeQueried = nRasterYSize;
    1271             : 
    1272             :         void *pChunk =
    1273        2412 :             VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    1274             :                                 nFullResXSizeQueried, nFullResYSizeQueried);
    1275        2412 :         GByte *pabyChunkNoDataMask = nullptr;
    1276             : 
    1277        2412 :         GDALRasterBand *poMaskBand = GetMaskBand();
    1278        2412 :         int l_nMaskFlags = GetMaskFlags();
    1279             : 
    1280        2412 :         bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
    1281        2412 :         if (bUseNoDataMask)
    1282             :         {
    1283         120 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1284             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1285             :         }
    1286        2412 :         if (pChunk == nullptr ||
    1287         120 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1288             :         {
    1289           0 :             GDALClose(poMEMDS);
    1290           0 :             CPLFree(pChunk);
    1291           0 :             CPLFree(pabyChunkNoDataMask);
    1292           0 :             VSIFree(pTempBuffer);
    1293           0 :             return CE_Failure;
    1294             :         }
    1295             : 
    1296        2412 :         int nTotalBlocks = ((nBufXSize + nDstBlockXSize - 1) / nDstBlockXSize) *
    1297        2412 :                            ((nBufYSize + nDstBlockYSize - 1) / nDstBlockYSize);
    1298        2412 :         int nBlocksDone = 0;
    1299             : 
    1300             :         int nDstYOff;
    1301        4824 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1302        2412 :              nDstYOff += nDstBlockYSize)
    1303             :         {
    1304             :             int nDstYCount;
    1305        2412 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1306        2412 :                 nDstYCount = nDstBlockYSize;
    1307             :             else
    1308           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1309             : 
    1310        2412 :             int nChunkYOff =
    1311        2412 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1312        2412 :             int nChunkYOff2 = nYOff + 1 +
    1313        2412 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1314             :                                                     dfYRatioDstToSrc));
    1315        2412 :             if (nChunkYOff2 > nRasterYSize)
    1316         313 :                 nChunkYOff2 = nRasterYSize;
    1317        2412 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1318        2412 :             CPLAssert(nYCount <= nFullResYChunk);
    1319             : 
    1320        2412 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
    1321        2412 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
    1322        2412 :             if (nChunkYOffQueried < 0)
    1323             :             {
    1324         221 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1325         221 :                 nChunkYOffQueried = 0;
    1326             :             }
    1327        2412 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1328         321 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1329        2412 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1330             : 
    1331        2412 :             int nDstXOff = 0;
    1332        4824 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1333        2412 :                  nDstXOff += nDstBlockXSize)
    1334             :             {
    1335        2412 :                 int nDstXCount = 0;
    1336        2412 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1337        2412 :                     nDstXCount = nDstBlockXSize;
    1338             :                 else
    1339           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1340             : 
    1341        2412 :                 int nChunkXOff =
    1342        2412 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1343        2412 :                 int nChunkXOff2 =
    1344        2412 :                     nXOff + 1 +
    1345        2412 :                     static_cast<int>(
    1346        2412 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1347        2412 :                 if (nChunkXOff2 > nRasterXSize)
    1348        2224 :                     nChunkXOff2 = nRasterXSize;
    1349        2412 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1350        2412 :                 CPLAssert(nXCount <= nFullResXChunk);
    1351             : 
    1352        2412 :                 int nChunkXOffQueried =
    1353        2412 :                     nChunkXOff - nKernelRadius * nOvrXFactor;
    1354        2412 :                 int nChunkXSizeQueried =
    1355        2412 :                     nXCount + 2 * nKernelRadius * nOvrXFactor;
    1356        2412 :                 if (nChunkXOffQueried < 0)
    1357             :                 {
    1358        2138 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1359        2138 :                     nChunkXOffQueried = 0;
    1360             :                 }
    1361        2412 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1362        2124 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1363        2412 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1364             : 
    1365             :                 // Read the source buffers.
    1366        2412 :                 eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1367             :                                 nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1368             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    1369             :                                 eWrkDataType, 0, 0, nullptr);
    1370             : 
    1371        2412 :                 bool bSkipResample = false;
    1372        2412 :                 bool bNoDataMaskFullyOpaque = false;
    1373        2412 :                 if (eErr == CE_None && bUseNoDataMask)
    1374             :                 {
    1375         120 :                     eErr = poMaskBand->RasterIO(
    1376             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1377             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1378             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1379             :                         nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
    1380             : 
    1381             :                     /* Optimizations if mask if fully opaque or transparent */
    1382         120 :                     int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1383         120 :                     GByte bVal = pabyChunkNoDataMask[0];
    1384         120 :                     int i = 1;
    1385      235310 :                     for (; i < nPixels; i++)
    1386             :                     {
    1387      235267 :                         if (pabyChunkNoDataMask[i] != bVal)
    1388          77 :                             break;
    1389             :                     }
    1390         120 :                     if (i == nPixels)
    1391             :                     {
    1392          43 :                         if (bVal == 0)
    1393             :                         {
    1394         712 :                             for (int j = 0; j < nDstYCount; j++)
    1395             :                             {
    1396         686 :                                 GDALCopyWords(&dfNoDataValue, GDT_Float64, 0,
    1397             :                                               static_cast<GByte *>(pDataMem) +
    1398         686 :                                                   nLSMem * (j + nDstYOff) +
    1399         686 :                                                   nDstXOff * nPSMem,
    1400             :                                               eDTMem, static_cast<int>(nPSMem),
    1401             :                                               nDstXCount);
    1402             :                             }
    1403          26 :                             bSkipResample = true;
    1404             :                         }
    1405             :                         else
    1406             :                         {
    1407          17 :                             bNoDataMaskFullyOpaque = true;
    1408             :                         }
    1409             :                     }
    1410             :                 }
    1411             : 
    1412        2412 :                 if (!bSkipResample && eErr == CE_None)
    1413             :                 {
    1414        2384 :                     const bool bPropagateNoData = false;
    1415        2384 :                     void *pDstBuffer = nullptr;
    1416        2384 :                     GDALDataType eDstBufferDataType = GDT_Unknown;
    1417             :                     GDALRasterBand *poMEMBand =
    1418        2384 :                         GDALRasterBand::FromHandle(hMEMBand);
    1419        2384 :                     GDALOverviewResampleArgs args;
    1420        2384 :                     args.eSrcDataType = eDataType;
    1421        2384 :                     args.eOvrDataType = poMEMBand->GetRasterDataType();
    1422        2384 :                     args.nOvrXSize = poMEMBand->GetXSize();
    1423        2384 :                     args.nOvrYSize = poMEMBand->GetYSize();
    1424        2384 :                     args.nOvrNBITS = nNBITS;
    1425        2384 :                     args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1426        2384 :                     args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1427        2384 :                     args.dfSrcXDelta =
    1428        2384 :                         dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1429        2384 :                     args.dfSrcYDelta =
    1430        2384 :                         dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1431        2384 :                     args.eWrkDataType = eWrkDataType;
    1432        2384 :                     args.pabyChunkNodataMask =
    1433        2384 :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
    1434        2384 :                     args.nChunkXOff =
    1435        2384 :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1436        2384 :                     args.nChunkXSize = nChunkXSizeQueried;
    1437        2384 :                     args.nChunkYOff =
    1438        2384 :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1439        2384 :                     args.nChunkYSize = nChunkYSizeQueried;
    1440        2384 :                     args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1441        2384 :                     args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
    1442        2384 :                     args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1443        2384 :                     args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
    1444        2384 :                     args.pszResampling = pszResampling;
    1445        2384 :                     args.bHasNoData = bHasNoData;
    1446        2384 :                     args.dfNoDataValue = dfNoDataValue;
    1447        2384 :                     args.poColorTable = GetColorTable();
    1448        2384 :                     args.bPropagateNoData = bPropagateNoData;
    1449        2384 :                     eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
    1450             :                                            &eDstBufferDataType);
    1451        2384 :                     if (eErr == CE_None)
    1452             :                     {
    1453        2384 :                         eErr = poMEMBand->RasterIO(
    1454             :                             GF_Write, nDstXOff + nDestXOffVirtual,
    1455             :                             nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
    1456             :                             pDstBuffer, nDstXCount, nDstYCount,
    1457             :                             eDstBufferDataType, 0, 0, nullptr);
    1458             :                     }
    1459        2384 :                     CPLFree(pDstBuffer);
    1460             :                 }
    1461             : 
    1462        2412 :                 nBlocksDone++;
    1463        2441 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1464          29 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    1465             :                                              "", psExtraArg->pProgressData))
    1466             :                 {
    1467           1 :                     eErr = CE_Failure;
    1468             :                 }
    1469             :             }
    1470             :         }
    1471             : 
    1472        2412 :         CPLFree(pChunk);
    1473        2412 :         CPLFree(pabyChunkNoDataMask);
    1474             :     }
    1475             : 
    1476        2561 :     if (eBufType != eDataType)
    1477             :     {
    1478          40 :         CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
    1479             :             GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
    1480             :             pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    1481             :             nullptr));
    1482             :     }
    1483        2561 :     GDALClose(poMEMDS);
    1484        2561 :     VSIFree(pTempBuffer);
    1485             : 
    1486        2561 :     return eErr;
    1487             : }
    1488             : 
    1489             : /************************************************************************/
    1490             : /*                          RasterIOResampled()                         */
    1491             : /************************************************************************/
    1492             : 
    1493         279 : CPLErr GDALDataset::RasterIOResampled(
    1494             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1495             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1496             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    1497             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    1498             : 
    1499             : {
    1500             : #if 0
    1501             :     // Determine if we use warping resampling or overview resampling
    1502             :     bool bUseWarp = false;
    1503             :     if( GDALDataTypeIsComplex( eDataType ) )
    1504             :         bUseWarp = true;
    1505             : #endif
    1506             : 
    1507         279 :     double dfXOff = nXOff;
    1508         279 :     double dfYOff = nYOff;
    1509         279 :     double dfXSize = nXSize;
    1510         279 :     double dfYSize = nYSize;
    1511         279 :     if (psExtraArg->bFloatingPointWindowValidity)
    1512             :     {
    1513         157 :         dfXOff = psExtraArg->dfXOff;
    1514         157 :         dfYOff = psExtraArg->dfYOff;
    1515         157 :         dfXSize = psExtraArg->dfXSize;
    1516         157 :         dfYSize = psExtraArg->dfYSize;
    1517             :     }
    1518             : 
    1519         279 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1520         279 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1521             : 
    1522             :     // Determine the coordinates in the "virtual" output raster to see
    1523             :     // if there are not integers, in which case we will use them as a shift
    1524             :     // so that subwindow extracts give the exact same results as entire raster
    1525             :     // scaling.
    1526         279 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1527         279 :     bool bHasXOffVirtual = false;
    1528         279 :     int nDestXOffVirtual = 0;
    1529         279 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1530             :     {
    1531         162 :         bHasXOffVirtual = true;
    1532         162 :         dfXOff = nXOff;
    1533         162 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1534             :     }
    1535             : 
    1536         279 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1537         279 :     bool bHasYOffVirtual = false;
    1538         279 :     int nDestYOffVirtual = 0;
    1539         279 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1540             :     {
    1541         121 :         bHasYOffVirtual = true;
    1542         121 :         dfYOff = nYOff;
    1543         121 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1544             :     }
    1545             : 
    1546             :     // Create a MEM dataset that wraps the output buffer.
    1547             :     GDALDataset *poMEMDS =
    1548         279 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1549             :                            nDestYOffVirtual + nBufYSize, 0, eBufType, nullptr);
    1550             :     GDALRasterBand **papoDstBands = static_cast<GDALRasterBand **>(
    1551         277 :         CPLMalloc(nBandCount * sizeof(GDALRasterBand *)));
    1552         265 :     int nNBITS = 0;
    1553        1203 :     for (int i = 0; i < nBandCount; i++)
    1554             :     {
    1555         932 :         char szBuffer[32] = {'\0'};
    1556        1880 :         int nRet = CPLPrintPointer(
    1557             :             szBuffer,
    1558         932 :             static_cast<GByte *>(pData) - nPixelSpace * nDestXOffVirtual -
    1559         932 :                 nLineSpace * nDestYOffVirtual + nBandSpace * i,
    1560             :             sizeof(szBuffer));
    1561         948 :         szBuffer[nRet] = 0;
    1562             : 
    1563         948 :         char szBuffer0[64] = {'\0'};
    1564         948 :         snprintf(szBuffer0, sizeof(szBuffer0), "DATAPOINTER=%s", szBuffer);
    1565             : 
    1566         948 :         char szBuffer1[64] = {'\0'};
    1567         948 :         snprintf(szBuffer1, sizeof(szBuffer1), "PIXELOFFSET=" CPL_FRMT_GIB,
    1568             :                  static_cast<GIntBig>(nPixelSpace));
    1569             : 
    1570         948 :         char szBuffer2[64] = {'\0'};
    1571         948 :         snprintf(szBuffer2, sizeof(szBuffer2), "LINEOFFSET=" CPL_FRMT_GIB,
    1572             :                  static_cast<GIntBig>(nLineSpace));
    1573             : 
    1574         948 :         char *apszOptions[4] = {szBuffer0, szBuffer1, szBuffer2, nullptr};
    1575             : 
    1576         948 :         poMEMDS->AddBand(eBufType, apszOptions);
    1577             : 
    1578         935 :         GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
    1579         919 :         papoDstBands[i] = poMEMDS->GetRasterBand(i + 1);
    1580             :         const char *pszNBITS =
    1581         923 :             poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1582         925 :         if (pszNBITS)
    1583             :         {
    1584           0 :             nNBITS = atoi(pszNBITS);
    1585           0 :             poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
    1586           0 :                                                            "IMAGE_STRUCTURE");
    1587             :         }
    1588             :     }
    1589             : 
    1590         271 :     CPLErr eErr = CE_None;
    1591             : 
    1592             :     // TODO(schwehr): Why disabled?  Why not just delete?
    1593             :     // Looks like this code was initially added as disable by copying
    1594             :     // from RasterIO here:
    1595             :     // https://trac.osgeo.org/gdal/changeset/29572
    1596             : #if 0
    1597             :     // Do the resampling.
    1598             :     if( bUseWarp )
    1599             :     {
    1600             :         VRTDatasetH hVRTDS = nullptr;
    1601             :         GDALRasterBandH hVRTBand = nullptr;
    1602             :         if( GetDataset() == nullptr )
    1603             :         {
    1604             :             /* Create VRT dataset that wraps the whole dataset */
    1605             :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1606             :             VRTAddBand( hVRTDS, eDataType, nullptr );
    1607             :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1608             :             VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
    1609             :                                 (GDALRasterBandH)this,
    1610             :                                 0, 0,
    1611             :                                 nRasterXSize, nRasterYSize,
    1612             :                                 0, 0,
    1613             :                                 nRasterXSize, nRasterYSize,
    1614             :                                 nullptr, VRT_NODATA_UNSET );
    1615             : 
    1616             :             /* Add a mask band if needed */
    1617             :             if( GetMaskFlags() != GMF_ALL_VALID )
    1618             :             {
    1619             :                 ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
    1620             :                 VRTSourcedRasterBand* poVRTMaskBand =
    1621             :                     (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
    1622             :                 poVRTMaskBand->
    1623             :                     AddMaskBandSource( this,
    1624             :                                     0, 0,
    1625             :                                     nRasterXSize, nRasterYSize,
    1626             :                                     0, 0,
    1627             :                                     nRasterXSize, nRasterYSize);
    1628             :             }
    1629             :         }
    1630             : 
    1631             :         GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
    1632             :         psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
    1633             :         psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
    1634             :         psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
    1635             :         psWarpOptions->nBandCount = 1;
    1636             :         int nSrcBandNumber = (hVRTDS ? 1 : nBand);
    1637             :         int nDstBandNumber = 1;
    1638             :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1639             :         psWarpOptions->panDstBands = &nDstBandNumber;
    1640             :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
    1641             :                     psExtraArg->pfnProgress : GDALDummyProgress;
    1642             :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1643             :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1644             :         GDALRasterIOTransformerStruct sTransformer;
    1645             :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1646             :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1647             :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1648             :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1649             :         psWarpOptions->pTransformerArg = &sTransformer;
    1650             : 
    1651             :         GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
    1652             :         eErr = GDALChunkAndWarpImage( hWarpOperation,
    1653             :                                       nDestXOffVirtual, nDestYOffVirtual,
    1654             :                                       nBufXSize, nBufYSize );
    1655             :         GDALDestroyWarpOperation( hWarpOperation );
    1656             : 
    1657             :         psWarpOptions->panSrcBands = nullptr;
    1658             :         psWarpOptions->panDstBands = nullptr;
    1659             :         GDALDestroyWarpOptions( psWarpOptions );
    1660             : 
    1661             :         if( hVRTDS )
    1662             :             GDALClose(hVRTDS);
    1663             :     }
    1664             :     else
    1665             : #endif
    1666             :     {
    1667         271 :         const char *pszResampling =
    1668         427 :             (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
    1669         156 :             : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
    1670           0 :             : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
    1671           0 :             : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
    1672           0 :             : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
    1673           0 :             : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
    1674           0 :             : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
    1675           0 :             : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
    1676             :                                                                : "UNKNOWN";
    1677             : 
    1678         271 :         GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
    1679         268 :         GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
    1680             :         int nBlockXSize, nBlockYSize;
    1681         271 :         poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    1682             : 
    1683             :         int nKernelRadius;
    1684             :         GDALResampleFunction pfnResampleFunc =
    1685         259 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1686         259 :         CPLAssert(pfnResampleFunc);
    1687             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1688             :         GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
    1689             :             GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
    1690             : #endif
    1691             :         GDALDataType eWrkDataType =
    1692         259 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1693             : 
    1694         259 :         int nDstBlockXSize = nBufXSize;
    1695         259 :         int nDstBlockYSize = nBufYSize;
    1696             :         int nFullResXChunk, nFullResYChunk;
    1697             :         while (true)
    1698             :         {
    1699         259 :             nFullResXChunk =
    1700         259 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1701         259 :             nFullResYChunk =
    1702         259 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1703         259 :             if (nFullResXChunk > nRasterXSize)
    1704         139 :                 nFullResXChunk = nRasterXSize;
    1705         259 :             if (nFullResYChunk > nRasterYSize)
    1706          33 :                 nFullResYChunk = nRasterYSize;
    1707         259 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1708         257 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1709             :                  1024 * 1024))
    1710             :                 break;
    1711             :             // When operating on the full width of a raster whose block width is
    1712             :             // the raster width, prefer doing chunks in height.
    1713           0 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1714             :                 nDstBlockYSize > 1)
    1715           0 :                 nDstBlockYSize /= 2;
    1716             :             /* Otherwise cut the maximal dimension */
    1717           0 :             else if (nDstBlockXSize > 1 &&
    1718           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1719           0 :                 nDstBlockXSize /= 2;
    1720             :             else
    1721           0 :                 nDstBlockYSize /= 2;
    1722             :         }
    1723             : 
    1724         524 :         int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    1725         259 :                                   static_cast<int>(0.5 + dfYRatioDstToSrc));
    1726         265 :         if (nOvrFactor == 0)
    1727          90 :             nOvrFactor = 1;
    1728         265 :         int nFullResXSizeQueried =
    1729         265 :             nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    1730         265 :         int nFullResYSizeQueried =
    1731         265 :             nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
    1732             : 
    1733         265 :         if (nFullResXSizeQueried > nRasterXSize)
    1734         158 :             nFullResXSizeQueried = nRasterXSize;
    1735         265 :         if (nFullResYSizeQueried > nRasterYSize)
    1736          36 :             nFullResYSizeQueried = nRasterYSize;
    1737             : 
    1738         265 :         void *pChunk = VSI_MALLOC3_VERBOSE(
    1739             :             cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
    1740             :                               nBandCount),
    1741             :             nFullResXSizeQueried, nFullResYSizeQueried);
    1742         277 :         GByte *pabyChunkNoDataMask = nullptr;
    1743             : 
    1744         277 :         GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
    1745         272 :         int nMaskFlags = poFirstSrcBand->GetMaskFlags();
    1746             : 
    1747         274 :         bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
    1748         274 :         if (bUseNoDataMask)
    1749             :         {
    1750          55 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1751             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1752             :         }
    1753         274 :         if (pChunk == nullptr ||
    1754          55 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1755             :         {
    1756          12 :             GDALClose(poMEMDS);
    1757           0 :             CPLFree(pChunk);
    1758           0 :             CPLFree(pabyChunkNoDataMask);
    1759           0 :             CPLFree(papoDstBands);
    1760           0 :             return CE_Failure;
    1761             :         }
    1762             : 
    1763         262 :         int nTotalBlocks = ((nBufXSize + nDstBlockXSize - 1) / nDstBlockXSize) *
    1764         262 :                            ((nBufYSize + nDstBlockYSize - 1) / nDstBlockYSize);
    1765         262 :         int nBlocksDone = 0;
    1766             : 
    1767             :         int nDstYOff;
    1768         544 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1769         282 :              nDstYOff += nDstBlockYSize)
    1770             :         {
    1771             :             int nDstYCount;
    1772         258 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1773         261 :                 nDstYCount = nDstBlockYSize;
    1774             :             else
    1775           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1776             : 
    1777         258 :             int nChunkYOff =
    1778         258 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1779         258 :             int nChunkYOff2 = nYOff + 1 +
    1780         258 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1781             :                                                     dfYRatioDstToSrc));
    1782         258 :             if (nChunkYOff2 > nRasterYSize)
    1783          56 :                 nChunkYOff2 = nRasterYSize;
    1784         258 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1785         258 :             CPLAssert(nYCount <= nFullResYChunk);
    1786             : 
    1787         258 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    1788         258 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
    1789         258 :             if (nChunkYOffQueried < 0)
    1790             :             {
    1791          56 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1792          56 :                 nChunkYOffQueried = 0;
    1793             :             }
    1794         258 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1795          66 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1796         258 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1797             : 
    1798             :             int nDstXOff;
    1799         538 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1800         280 :                  nDstXOff += nDstBlockXSize)
    1801             :             {
    1802             :                 int nDstXCount;
    1803         256 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1804         256 :                     nDstXCount = nDstBlockXSize;
    1805             :                 else
    1806           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1807             : 
    1808         256 :                 int nChunkXOff =
    1809         256 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1810         256 :                 int nChunkXOff2 =
    1811         256 :                     nXOff + 1 +
    1812         256 :                     static_cast<int>(
    1813         256 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1814         256 :                 if (nChunkXOff2 > nRasterXSize)
    1815         144 :                     nChunkXOff2 = nRasterXSize;
    1816         256 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1817         256 :                 CPLAssert(nXCount <= nFullResXChunk);
    1818             : 
    1819         256 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    1820         256 :                 int nChunkXSizeQueried =
    1821         256 :                     nXCount + 2 * nKernelRadius * nOvrFactor;
    1822         256 :                 if (nChunkXOffQueried < 0)
    1823             :                 {
    1824         136 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1825         136 :                     nChunkXOffQueried = 0;
    1826             :                 }
    1827         256 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1828         144 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1829         256 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1830             : 
    1831         256 :                 bool bSkipResample = false;
    1832         256 :                 bool bNoDataMaskFullyOpaque = false;
    1833         256 :                 if (eErr == CE_None && bUseNoDataMask)
    1834             :                 {
    1835          55 :                     eErr = poMaskBand->RasterIO(
    1836             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1837             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1838             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1839             :                         nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
    1840             : 
    1841             :                     /* Optimizations if mask if fully opaque or transparent */
    1842          55 :                     const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1843          55 :                     const GByte bVal = pabyChunkNoDataMask[0];
    1844          55 :                     int i = 1;  // Used after for.
    1845      123794 :                     for (; i < nPixels; i++)
    1846             :                     {
    1847      123777 :                         if (pabyChunkNoDataMask[i] != bVal)
    1848          38 :                             break;
    1849             :                     }
    1850          55 :                     if (i == nPixels)
    1851             :                     {
    1852          17 :                         if (bVal == 0)
    1853             :                         {
    1854          16 :                             GByte abyZero[16] = {0};
    1855          64 :                             for (int iBand = 0; iBand < nBandCount; iBand++)
    1856             :                             {
    1857        2016 :                                 for (int j = 0; j < nDstYCount; j++)
    1858             :                                 {
    1859        1968 :                                     GDALCopyWords(
    1860             :                                         abyZero, GDT_Byte, 0,
    1861             :                                         static_cast<GByte *>(pData) +
    1862        1968 :                                             iBand * nBandSpace +
    1863        1968 :                                             nLineSpace * (j + nDstYOff) +
    1864        1968 :                                             nDstXOff * nPixelSpace,
    1865             :                                         eBufType, static_cast<int>(nPixelSpace),
    1866             :                                         nDstXCount);
    1867             :                                 }
    1868             :                             }
    1869          16 :                             bSkipResample = true;
    1870             :                         }
    1871             :                         else
    1872             :                         {
    1873           1 :                             bNoDataMaskFullyOpaque = true;
    1874             :                         }
    1875             :                     }
    1876             :                 }
    1877             : 
    1878         256 :                 if (!bSkipResample && eErr == CE_None)
    1879             :                 {
    1880             :                     /* Read the source buffers */
    1881         244 :                     eErr = RasterIO(
    1882             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1883             :                         nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1884             :                         nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
    1885             :                         nBandCount, panBandMap, 0, 0, 0, nullptr);
    1886             :                 }
    1887             : 
    1888             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1889             :                 if (pfnResampleFuncMultiBands && !bSkipResample &&
    1890             :                     eErr == CE_None)
    1891             :                 {
    1892             :                     eErr = pfnResampleFuncMultiBands(
    1893             :                         dfXRatioDstToSrc, dfYRatioDstToSrc,
    1894             :                         dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
    1895             :                         dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
    1896             :                         eWrkDataType, (GByte *)pChunk, nBandCount,
    1897             :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
    1898             :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
    1899             :                         nChunkXSizeQueried,
    1900             :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
    1901             :                         nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
    1902             :                         nDstXOff + nDestXOffVirtual + nDstXCount,
    1903             :                         nDstYOff + nDestYOffVirtual,
    1904             :                         nDstYOff + nDestYOffVirtual + nDstYCount, papoDstBands,
    1905             :                         pszResampling, FALSE /*bHasNoData*/,
    1906             :                         0.0 /* dfNoDataValue */, nullptr /* color table*/,
    1907             :                         eDataType);
    1908             :                 }
    1909             :                 else
    1910             : #endif
    1911             :                 {
    1912             :                     size_t nChunkBandOffset =
    1913         274 :                         static_cast<size_t>(nChunkXSizeQueried) *
    1914         274 :                         nChunkYSizeQueried *
    1915         274 :                         GDALGetDataTypeSizeBytes(eWrkDataType);
    1916        1190 :                     for (int i = 0;
    1917        1190 :                          i < nBandCount && !bSkipResample && eErr == CE_None;
    1918             :                          i++)
    1919             :                     {
    1920         910 :                         const bool bPropagateNoData = false;
    1921         910 :                         void *pDstBuffer = nullptr;
    1922         910 :                         GDALDataType eDstBufferDataType = GDT_Unknown;
    1923             :                         GDALRasterBand *poMEMBand =
    1924         910 :                             poMEMDS->GetRasterBand(i + 1);
    1925         910 :                         GDALOverviewResampleArgs args;
    1926         910 :                         args.eSrcDataType = eDataType;
    1927         910 :                         args.eOvrDataType = poMEMBand->GetRasterDataType();
    1928         909 :                         args.nOvrXSize = poMEMBand->GetXSize();
    1929         908 :                         args.nOvrYSize = poMEMBand->GetYSize();
    1930         905 :                         args.nOvrNBITS = nNBITS;
    1931         905 :                         args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1932         905 :                         args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1933         905 :                         args.dfSrcXDelta =
    1934         905 :                             dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1935         905 :                         args.dfSrcYDelta =
    1936         905 :                             dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1937         905 :                         args.eWrkDataType = eWrkDataType;
    1938         905 :                         args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
    1939         905 :                                                        ? nullptr
    1940             :                                                        : pabyChunkNoDataMask;
    1941         905 :                         args.nChunkXOff =
    1942         905 :                             nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1943         905 :                         args.nChunkXSize = nChunkXSizeQueried;
    1944         905 :                         args.nChunkYOff =
    1945         905 :                             nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1946         905 :                         args.nChunkYSize = nChunkYSizeQueried;
    1947         905 :                         args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1948         905 :                         args.nDstXOff2 =
    1949         905 :                             nDstXOff + nDestXOffVirtual + nDstXCount;
    1950         905 :                         args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1951         905 :                         args.nDstYOff2 =
    1952         905 :                             nDstYOff + nDestYOffVirtual + nDstYCount;
    1953         905 :                         args.pszResampling = pszResampling;
    1954         905 :                         args.bHasNoData = false;
    1955         905 :                         args.dfNoDataValue = 0.0;
    1956         905 :                         args.poColorTable = nullptr;
    1957         905 :                         args.bPropagateNoData = bPropagateNoData;
    1958             : 
    1959             :                         eErr =
    1960        1815 :                             pfnResampleFunc(args,
    1961         905 :                                             reinterpret_cast<GByte *>(pChunk) +
    1962         905 :                                                 i * nChunkBandOffset,
    1963             :                                             &pDstBuffer, &eDstBufferDataType);
    1964         910 :                         if (eErr == CE_None)
    1965             :                         {
    1966         910 :                             eErr = poMEMBand->RasterIO(
    1967             :                                 GF_Write, nDstXOff + nDestXOffVirtual,
    1968             :                                 nDstYOff + nDestYOffVirtual, nDstXCount,
    1969             :                                 nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
    1970             :                                 eDstBufferDataType, 0, 0, nullptr);
    1971             :                         }
    1972         910 :                         CPLFree(pDstBuffer);
    1973             :                     }
    1974             :                 }
    1975             : 
    1976         280 :                 nBlocksDone++;
    1977         282 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1978           2 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    1979             :                                              "", psExtraArg->pProgressData))
    1980             :                 {
    1981           0 :                     eErr = CE_Failure;
    1982             :                 }
    1983             :             }
    1984             :         }
    1985             : 
    1986         286 :         CPLFree(pChunk);
    1987         280 :         CPLFree(pabyChunkNoDataMask);
    1988             :     }
    1989             : 
    1990         280 :     CPLFree(papoDstBands);
    1991         280 :     GDALClose(poMEMDS);
    1992             : 
    1993         280 :     return eErr;
    1994             : }
    1995             : 
    1996             : //! @endcond
    1997             : 
    1998             : /************************************************************************/
    1999             : /*                           GDALSwapWords()                            */
    2000             : /************************************************************************/
    2001             : 
    2002             : /**
    2003             :  * Byte swap words in-place.
    2004             :  *
    2005             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2006             :  * a memory array.  No assumption is made that the words being swapped are
    2007             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2008             :  * to determine if the current platform is big endian or little endian.  Use
    2009             :  * The macros like CPL_SWAP32() to byte swap single values without the overhead
    2010             :  * of a function call.
    2011             :  *
    2012             :  * @param pData pointer to start of data buffer.
    2013             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2014             :  * @param nWordCount the number of words to be swapped in this call.
    2015             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2016             :  * the next. For packed buffers this is the same as nWordSize.
    2017             :  */
    2018             : 
    2019      438599 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
    2020             :                                int nWordSkip)
    2021             : 
    2022             : {
    2023      438599 :     if (nWordCount > 0)
    2024      438599 :         VALIDATE_POINTER0(pData, "GDALSwapWords");
    2025             : 
    2026      438599 :     GByte *pabyData = static_cast<GByte *>(pData);
    2027             : 
    2028      438599 :     switch (nWordSize)
    2029             :     {
    2030        7204 :         case 1:
    2031        7204 :             break;
    2032             : 
    2033      418136 :         case 2:
    2034      418136 :             CPLAssert(nWordSkip >= 2 || nWordCount == 1);
    2035   289092000 :             for (int i = 0; i < nWordCount; i++)
    2036             :             {
    2037   288673000 :                 CPL_SWAP16PTR(pabyData);
    2038   288673000 :                 pabyData += nWordSkip;
    2039             :             }
    2040      418136 :             break;
    2041             : 
    2042       10688 :         case 4:
    2043       10688 :             CPLAssert(nWordSkip >= 4 || nWordCount == 1);
    2044       10688 :             if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
    2045             :             {
    2046    29148000 :                 for (int i = 0; i < nWordCount; i++)
    2047             :                 {
    2048    29137300 :                     *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
    2049             :                         *reinterpret_cast<const GUInt32 *>(pabyData));
    2050    29137300 :                     pabyData += nWordSkip;
    2051       10685 :                 }
    2052             :             }
    2053             :             else
    2054             :             {
    2055           9 :                 for (int i = 0; i < nWordCount; i++)
    2056             :                 {
    2057           6 :                     CPL_SWAP32PTR(pabyData);
    2058           6 :                     pabyData += nWordSkip;
    2059             :                 }
    2060             :             }
    2061       10688 :             break;
    2062             : 
    2063        2571 :         case 8:
    2064        2571 :             CPLAssert(nWordSkip >= 8 || nWordCount == 1);
    2065             : #ifdef CPL_HAS_GINT64
    2066        2571 :             if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
    2067             :             {
    2068     3359870 :                 for (int i = 0; i < nWordCount; i++)
    2069             :                 {
    2070     3357300 :                     *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
    2071             :                         *reinterpret_cast<const GUInt64 *>(pabyData));
    2072     3357300 :                     pabyData += nWordSkip;
    2073        2570 :                 }
    2074             :             }
    2075             :             else
    2076             : #endif
    2077             :             {
    2078           3 :                 for (int i = 0; i < nWordCount; i++)
    2079             :                 {
    2080           2 :                     CPL_SWAP64PTR(pabyData);
    2081           2 :                     pabyData += nWordSkip;
    2082             :                 }
    2083             :             }
    2084        2571 :             break;
    2085             : 
    2086           0 :         default:
    2087           0 :             CPLAssert(false);
    2088             :     }
    2089             : }
    2090             : 
    2091             : /************************************************************************/
    2092             : /*                           GDALSwapWordsEx()                          */
    2093             : /************************************************************************/
    2094             : 
    2095             : /**
    2096             :  * Byte swap words in-place.
    2097             :  *
    2098             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2099             :  * a memory array.  No assumption is made that the words being swapped are
    2100             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2101             :  * to determine if the current platform is big endian or little endian.  Use
    2102             :  * The macros like CPL_SWAP32() to byte swap single values without the overhead
    2103             :  * of a function call.
    2104             :  *
    2105             :  * @param pData pointer to start of data buffer.
    2106             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2107             :  * @param nWordCount the number of words to be swapped in this call.
    2108             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2109             :  * the next. For packed buffers this is the same as nWordSize.
    2110             :  * @since GDAL 2.1
    2111             :  */
    2112        6328 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
    2113             :                                  int nWordSkip)
    2114             : {
    2115        6328 :     GByte *pabyData = static_cast<GByte *>(pData);
    2116       12656 :     while (nWordCount)
    2117             :     {
    2118             :         // Pick-up a multiple of 8 as max chunk size.
    2119        6328 :         const int nWordCountSmall =
    2120        6328 :             (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
    2121        6328 :         GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
    2122        6328 :         pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
    2123        6328 :         nWordCount -= nWordCountSmall;
    2124             :     }
    2125        6328 : }
    2126             : 
    2127             : // Place the new GDALCopyWords helpers in an anonymous namespace
    2128             : namespace
    2129             : {
    2130             : 
    2131             : /************************************************************************/
    2132             : /*                           GDALCopyWordsT()                           */
    2133             : /************************************************************************/
    2134             : /**
    2135             :  * Template function, used to copy data from pSrcData into buffer
    2136             :  * pDstData, with stride nSrcPixelStride in the source data and
    2137             :  * stride nDstPixelStride in the destination data. This template can
    2138             :  * deal with the case where the input data type is real or complex and
    2139             :  * the output is real.
    2140             :  *
    2141             :  * @param pSrcData the source data buffer
    2142             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2143             :  *                      of interest.
    2144             :  * @param pDstData the destination buffer.
    2145             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2146             :  *                      interest.
    2147             :  * @param nWordCount the total number of pixel words to copy
    2148             :  *
    2149             :  * @code
    2150             :  * // Assume an input buffer of type GUInt16 named pBufferIn
    2151             :  * GByte *pBufferOut = new GByte[numBytesOut];
    2152             :  * GDALCopyWordsT<GUInt16, GByte>(pSrcData, 2, pDstData, 1, numBytesOut);
    2153             :  * @endcode
    2154             :  * @note
    2155             :  * This is a private function, and should not be exposed outside of
    2156             :  * rasterio.cpp. External users should call the GDALCopyWords driver function.
    2157             :  */
    2158             : 
    2159             : template <class Tin, class Tout>
    2160    46343755 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
    2161             :                                          int nSrcPixelStride,
    2162             :                                          Tout *const CPL_RESTRICT pDstData,
    2163             :                                          int nDstPixelStride,
    2164             :                                          GPtrDiff_t nWordCount)
    2165             : {
    2166    46343755 :     decltype(nWordCount) nDstOffset = 0;
    2167             : 
    2168    46343755 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2169    46343755 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2170   442519228 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2171             :     {
    2172   396168361 :         const Tin tValue =
    2173   396168361 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2174   396168361 :         Tout *const pOutPixel =
    2175   396168361 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2176             : 
    2177   396168361 :         GDALCopyWord(tValue, *pOutPixel);
    2178             : 
    2179   396175461 :         nDstOffset += nDstPixelStride;
    2180             :     }
    2181    46350885 : }
    2182             : 
    2183             : template <class Tin, class Tout>
    2184    38234082 : static void inline GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
    2185             :                                   int nSrcPixelStride,
    2186             :                                   Tout *const CPL_RESTRICT pDstData,
    2187             :                                   int nDstPixelStride, GPtrDiff_t nWordCount)
    2188             : {
    2189    38234082 :     GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
    2190             :                           nWordCount);
    2191    38234222 : }
    2192             : 
    2193             : template <class Tin, class Tout>
    2194      194232 : static void inline GDALCopyWordsT_8atatime(
    2195             :     const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
    2196             :     Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
    2197             :     GPtrDiff_t nWordCount)
    2198             : {
    2199      194232 :     decltype(nWordCount) nDstOffset = 0;
    2200             : 
    2201      194232 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2202      194232 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2203      194232 :     decltype(nWordCount) n = 0;
    2204      194232 :     if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
    2205             :         nDstPixelStride == static_cast<int>(sizeof(Tout)))
    2206             :     {
    2207    22731350 :         for (; n < nWordCount - 7; n += 8)
    2208             :         {
    2209    22536468 :             const Tin *pInValues = reinterpret_cast<const Tin *>(
    2210    22536468 :                 pSrcDataPtr + (n * nSrcPixelStride));
    2211    22536468 :             Tout *const pOutPixels =
    2212    22536468 :                 reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2213             : 
    2214    22536468 :             GDALCopy8Words(pInValues, pOutPixels);
    2215             : 
    2216    22537998 :             nDstOffset += 8 * nDstPixelStride;
    2217             :         }
    2218             :     }
    2219      687113 :     for (; n < nWordCount; n++)
    2220             :     {
    2221      492891 :         const Tin tValue =
    2222      492891 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2223      492891 :         Tout *const pOutPixel =
    2224      492891 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2225             : 
    2226      492891 :         GDALCopyWord(tValue, *pOutPixel);
    2227             : 
    2228      491368 :         nDstOffset += nDstPixelStride;
    2229             :     }
    2230      194222 : }
    2231             : 
    2232             : #ifdef HAVE_SSE2
    2233             : 
    2234             : template <class Tout>
    2235       38739 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
    2236             :                               int nSrcPixelStride,
    2237             :                               Tout *const CPL_RESTRICT pDstData,
    2238             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2239             : {
    2240             :     static_assert(std::is_integral<Tout>::value &&
    2241             :                       sizeof(Tout) == sizeof(uint16_t),
    2242             :                   "Bad Tout");
    2243       38739 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2244             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2245             :     {
    2246       32688 :         decltype(nWordCount) n = 0;
    2247       32688 :         const __m128i xmm_zero = _mm_setzero_si128();
    2248       32688 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2249             :             reinterpret_cast<GByte *>(pDstData);
    2250     1499275 :         for (; n < nWordCount - 15; n += 16)
    2251             :         {
    2252     1466587 :             __m128i xmm = _mm_loadu_si128(
    2253     1466587 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2254     1466587 :             __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
    2255     1466587 :             __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
    2256             :             _mm_storeu_si128(
    2257     1466587 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
    2258             :             _mm_storeu_si128(
    2259     1466587 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
    2260             :         }
    2261      106775 :         for (; n < nWordCount; n++)
    2262             :         {
    2263       74087 :             pDstData[n] = pSrcData[n];
    2264       32688 :         }
    2265             :     }
    2266             :     else
    2267             :     {
    2268        6051 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2269             :                               nDstPixelStride, nWordCount);
    2270             :     }
    2271       38739 : }
    2272             : 
    2273             : template <>
    2274       25763 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2275             :                     int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
    2276             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2277             : {
    2278       25763 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2279             :                              nDstPixelStride, nWordCount);
    2280       25763 : }
    2281             : 
    2282             : template <>
    2283       12976 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2284             :                     int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
    2285             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2286             : {
    2287       12976 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2288             :                              nDstPixelStride, nWordCount);
    2289       12976 : }
    2290             : 
    2291             : template <class Tout>
    2292    12095237 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
    2293             :                               int nSrcPixelStride,
    2294             :                               Tout *const CPL_RESTRICT pDstData,
    2295             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2296             : {
    2297             :     static_assert(std::is_integral<Tout>::value &&
    2298             :                       sizeof(Tout) == sizeof(uint32_t),
    2299             :                   "Bad Tout");
    2300    12095237 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2301             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2302             :     {
    2303     6374127 :         decltype(nWordCount) n = 0;
    2304     6374127 :         const __m128i xmm_zero = _mm_setzero_si128();
    2305     6374127 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2306             :             reinterpret_cast<GByte *>(pDstData);
    2307    71873351 :         for (; n < nWordCount - 15; n += 16)
    2308             :         {
    2309    65430824 :             __m128i xmm = _mm_loadu_si128(
    2310    65430824 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2311    65508424 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2312    65554324 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2313    65386624 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2314    65401624 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2315    65431224 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2316    65499224 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2317             :             _mm_storeu_si128(
    2318    65499224 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
    2319             :             _mm_storeu_si128(
    2320    65499224 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
    2321             :             _mm_storeu_si128(
    2322    65499224 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
    2323             :             _mm_storeu_si128(
    2324    65499224 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
    2325             :         }
    2326    15168832 :         for (; n < nWordCount; n++)
    2327             :         {
    2328     8726335 :             pDstData[n] = pSrcData[n];
    2329     6442467 :         }
    2330             :     }
    2331             :     else
    2332             :     {
    2333     5721090 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2334             :                               nDstPixelStride, nWordCount);
    2335             :     }
    2336    12169837 : }
    2337             : 
    2338             : template <>
    2339         437 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2340             :                     int nSrcPixelStride, GUInt32 *const CPL_RESTRICT pDstData,
    2341             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2342             : {
    2343         437 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2344             :                              nDstPixelStride, nWordCount);
    2345         437 : }
    2346             : 
    2347             : template <>
    2348    12096900 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2349             :                     int nSrcPixelStride, GInt32 *const CPL_RESTRICT pDstData,
    2350             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2351             : {
    2352    12096900 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2353             :                              nDstPixelStride, nWordCount);
    2354    12103800 : }
    2355             : 
    2356             : template <>
    2357     2470370 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2358             :                     int nSrcPixelStride, float *const CPL_RESTRICT pDstData,
    2359             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2360             : {
    2361     2470370 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2362             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2363             :     {
    2364      110924 :         decltype(nWordCount) n = 0;
    2365      110924 :         const __m128i xmm_zero = _mm_setzero_si128();
    2366      110924 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2367             :             reinterpret_cast<GByte *>(pDstData);
    2368     3271060 :         for (; n < nWordCount - 15; n += 16)
    2369             :         {
    2370     3160140 :             __m128i xmm = _mm_loadu_si128(
    2371     3160140 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2372     3160140 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2373     3160140 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2374     3160140 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2375     3160140 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2376     3160140 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2377     3160140 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2378     3160140 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2379     3160140 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2380     3160140 :             __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
    2381     3160140 :             __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
    2382     3160140 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2383             :                           xmm0_f);
    2384             :             _mm_storeu_ps(
    2385     3160140 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2386             :             _mm_storeu_ps(
    2387     3160140 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
    2388             :             _mm_storeu_ps(
    2389     3160140 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
    2390             :         }
    2391      471306 :         for (; n < nWordCount; n++)
    2392             :         {
    2393      360382 :             pDstData[n] = pSrcData[n];
    2394      110924 :         }
    2395             :     }
    2396             :     else
    2397             :     {
    2398     2359440 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2399             :                               nDstPixelStride, nWordCount);
    2400             :     }
    2401     2470370 : }
    2402             : 
    2403             : template <>
    2404      145276 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2405             :                     int nSrcPixelStride, double *const CPL_RESTRICT pDstData,
    2406             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2407             : {
    2408      145276 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2409             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2410             :     {
    2411      122459 :         decltype(nWordCount) n = 0;
    2412      122459 :         const __m128i xmm_zero = _mm_setzero_si128();
    2413      122459 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2414             :             reinterpret_cast<GByte *>(pDstData);
    2415     1409260 :         for (; n < nWordCount - 15; n += 16)
    2416             :         {
    2417     1286800 :             __m128i xmm = _mm_loadu_si128(
    2418     1286800 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2419     1286800 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2420     1286800 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2421     1286800 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2422     1286800 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2423     1286800 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2424     1286800 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2425             : 
    2426     1286800 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2427     1286800 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2428     1286800 :             __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
    2429     1286800 :             __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
    2430     1286800 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2431     1286800 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2432     1286800 :             xmm2 = _mm_srli_si128(xmm2, 8);
    2433     1286800 :             xmm3 = _mm_srli_si128(xmm3, 8);
    2434     1286800 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2435     1286800 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2436     1286800 :             __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
    2437     1286800 :             __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
    2438             : 
    2439     1286800 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2440             :                           xmm0_low_d);
    2441             :             _mm_storeu_pd(
    2442     1286800 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2443             :                 xmm0_high_d);
    2444             :             _mm_storeu_pd(
    2445     1286800 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2446             :                 xmm1_low_d);
    2447             :             _mm_storeu_pd(
    2448     1286800 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2449             :                 xmm1_high_d);
    2450             :             _mm_storeu_pd(
    2451     1286800 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2452             :                 xmm2_low_d);
    2453             :             _mm_storeu_pd(
    2454     1286800 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
    2455             :                 xmm2_high_d);
    2456             :             _mm_storeu_pd(
    2457     1286800 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2458             :                 xmm3_low_d);
    2459             :             _mm_storeu_pd(
    2460     1286800 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
    2461             :                 xmm3_high_d);
    2462             :         }
    2463      232943 :         for (; n < nWordCount; n++)
    2464             :         {
    2465      110484 :             pDstData[n] = pSrcData[n];
    2466      122459 :         }
    2467             :     }
    2468             :     else
    2469             :     {
    2470       22817 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2471             :                               nDstPixelStride, nWordCount);
    2472             :     }
    2473      145276 : }
    2474             : 
    2475             : template <>
    2476        5992 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2477             :                     int nSrcPixelStride, GByte *const CPL_RESTRICT pDstData,
    2478             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2479             : {
    2480        5992 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2481             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2482             :     {
    2483        5017 :         decltype(nWordCount) n = 0;
    2484             :         // In SSE2, min_epu16 does not exist, so shift from
    2485             :         // UInt16 to SInt16 to be able to use min_epi16
    2486        5017 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2487        5017 :         const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
    2488      138457 :         for (; n < nWordCount - 7; n += 8)
    2489             :         {
    2490      133440 :             __m128i xmm = _mm_loadu_si128(
    2491      133440 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2492      133440 :             xmm = _mm_add_epi16(xmm, xmm_UINT16_to_INT16);
    2493      133440 :             xmm = _mm_min_epi16(xmm, xmm_m255_shifted);
    2494      133440 :             xmm = _mm_sub_epi16(xmm, xmm_UINT16_to_INT16);
    2495      133440 :             xmm = _mm_packus_epi16(xmm, xmm);
    2496      133440 :             GDALCopyXMMToInt64(xmm,
    2497      133440 :                                reinterpret_cast<GPtrDiff_t *>(pDstData + n));
    2498             :         }
    2499       15907 :         for (; n < nWordCount; n++)
    2500             :         {
    2501       10890 :             pDstData[n] =
    2502       10890 :                 pSrcData[n] >= 255 ? 255 : static_cast<GByte>(pSrcData[n]);
    2503        5017 :         }
    2504             :     }
    2505             :     else
    2506             :     {
    2507         975 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2508             :                               nDstPixelStride, nWordCount);
    2509             :     }
    2510        5992 : }
    2511             : 
    2512             : template <>
    2513          21 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2514             :                     int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
    2515             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2516             : {
    2517          21 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2518             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2519             :     {
    2520          15 :         decltype(nWordCount) n = 0;
    2521             :         // In SSE2, min_epu16 does not exist, so shift from
    2522             :         // UInt16 to SInt16 to be able to use min_epi16
    2523          15 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2524          15 :         const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
    2525          31 :         for (; n < nWordCount - 7; n += 8)
    2526             :         {
    2527          16 :             __m128i xmm = _mm_loadu_si128(
    2528          16 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2529          16 :             xmm = _mm_add_epi16(xmm, xmm_UINT16_to_INT16);
    2530          16 :             xmm = _mm_min_epi16(xmm, xmm_32767_shifted);
    2531          16 :             xmm = _mm_sub_epi16(xmm, xmm_UINT16_to_INT16);
    2532          16 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm);
    2533             :         }
    2534          55 :         for (; n < nWordCount; n++)
    2535             :         {
    2536          40 :             pDstData[n] =
    2537          40 :                 pSrcData[n] >= 32767 ? 32767 : static_cast<GInt16>(pSrcData[n]);
    2538          15 :         }
    2539             :     }
    2540             :     else
    2541             :     {
    2542           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2543             :                               nDstPixelStride, nWordCount);
    2544             :     }
    2545          21 : }
    2546             : 
    2547             : template <>
    2548         412 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2549             :                     int nSrcPixelStride, float *const CPL_RESTRICT pDstData,
    2550             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2551             : {
    2552         412 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2553             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2554             :     {
    2555         406 :         decltype(nWordCount) n = 0;
    2556         406 :         const __m128i xmm_zero = _mm_setzero_si128();
    2557         406 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2558             :             reinterpret_cast<GByte *>(pDstData);
    2559        1500 :         for (; n < nWordCount - 7; n += 8)
    2560             :         {
    2561        1094 :             __m128i xmm = _mm_loadu_si128(
    2562        1094 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2563        1094 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2564        1094 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2565        1094 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2566        1094 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2567        1094 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2568             :                           xmm0_f);
    2569             :             _mm_storeu_ps(
    2570        1094 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2571             :         }
    2572        1483 :         for (; n < nWordCount; n++)
    2573             :         {
    2574        1077 :             pDstData[n] = pSrcData[n];
    2575         406 :         }
    2576             :     }
    2577             :     else
    2578             :     {
    2579           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2580             :                               nDstPixelStride, nWordCount);
    2581             :     }
    2582         412 : }
    2583             : 
    2584             : template <>
    2585         279 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2586             :                     int nSrcPixelStride, double *const CPL_RESTRICT pDstData,
    2587             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2588             : {
    2589         279 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2590             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2591             :     {
    2592         171 :         decltype(nWordCount) n = 0;
    2593         171 :         const __m128i xmm_zero = _mm_setzero_si128();
    2594         171 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2595             :             reinterpret_cast<GByte *>(pDstData);
    2596         219 :         for (; n < nWordCount - 7; n += 8)
    2597             :         {
    2598          48 :             __m128i xmm = _mm_loadu_si128(
    2599          48 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2600          48 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2601          48 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2602             : 
    2603          48 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2604          48 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2605          48 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2606          48 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2607          48 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2608          48 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2609             : 
    2610          48 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2611             :                           xmm0_low_d);
    2612             :             _mm_storeu_pd(
    2613          48 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2614             :                 xmm0_high_d);
    2615             :             _mm_storeu_pd(
    2616          48 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2617             :                 xmm1_low_d);
    2618             :             _mm_storeu_pd(
    2619          48 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2620             :                 xmm1_high_d);
    2621             :         }
    2622         429 :         for (; n < nWordCount; n++)
    2623             :         {
    2624         258 :             pDstData[n] = pSrcData[n];
    2625         171 :         }
    2626             :     }
    2627             :     else
    2628             :     {
    2629         108 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2630             :                               nDstPixelStride, nWordCount);
    2631             :     }
    2632         279 : }
    2633             : 
    2634             : template <>
    2635         789 : void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    2636             :                     int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
    2637             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2638             : {
    2639         789 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2640             :                             nDstPixelStride, nWordCount);
    2641         789 : }
    2642             : 
    2643             : #endif  // HAVE_SSE2
    2644             : 
    2645             : template <>
    2646      116659 : void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    2647             :                     int nSrcPixelStride, GByte *const CPL_RESTRICT pDstData,
    2648             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2649             : {
    2650      116659 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2651             :                             nDstPixelStride, nWordCount);
    2652      116660 : }
    2653             : 
    2654             : template <>
    2655       15146 : void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    2656             :                     int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
    2657             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2658             : {
    2659       15146 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2660             :                             nDstPixelStride, nWordCount);
    2661       15146 : }
    2662             : 
    2663             : template <>
    2664       61642 : void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    2665             :                     int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
    2666             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2667             : {
    2668       61642 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2669             :                             nDstPixelStride, nWordCount);
    2670       61638 : }
    2671             : 
    2672             : /************************************************************************/
    2673             : /*                   GDALCopyWordsComplexT()                            */
    2674             : /************************************************************************/
    2675             : /**
    2676             :  * Template function, used to copy data from pSrcData into buffer
    2677             :  * pDstData, with stride nSrcPixelStride in the source data and
    2678             :  * stride nDstPixelStride in the destination data. Deals with the
    2679             :  * complex case, where input is complex and output is complex.
    2680             :  *
    2681             :  * @param pSrcData the source data buffer
    2682             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2683             :  *                      of interest.
    2684             :  * @param pDstData the destination buffer.
    2685             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2686             :  *                      interest.
    2687             :  * @param nWordCount the total number of pixel words to copy
    2688             :  *
    2689             :  */
    2690             : template <class Tin, class Tout>
    2691      125132 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
    2692             :                                   int nSrcPixelStride,
    2693             :                                   Tout *const CPL_RESTRICT pDstData,
    2694             :                                   int nDstPixelStride, GPtrDiff_t nWordCount)
    2695             : {
    2696      125132 :     decltype(nWordCount) nDstOffset = 0;
    2697      125132 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2698      125132 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2699             : 
    2700     7337033 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2701             :     {
    2702     7211896 :         const Tin *const pPixelIn =
    2703     7211896 :             reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    2704     7211896 :         Tout *const pPixelOut =
    2705     7211896 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2706             : 
    2707     7211896 :         GDALCopyWord(pPixelIn[0], pPixelOut[0]);
    2708     7211896 :         GDALCopyWord(pPixelIn[1], pPixelOut[1]);
    2709             : 
    2710     7211896 :         nDstOffset += nDstPixelStride;
    2711             :     }
    2712      125132 : }
    2713             : 
    2714             : /************************************************************************/
    2715             : /*                   GDALCopyWordsComplexOutT()                         */
    2716             : /************************************************************************/
    2717             : /**
    2718             :  * Template function, used to copy data from pSrcData into buffer
    2719             :  * pDstData, with stride nSrcPixelStride in the source data and
    2720             :  * stride nDstPixelStride in the destination data. Deals with the
    2721             :  * case where the value is real coming in, but complex going out.
    2722             :  *
    2723             :  * @param pSrcData the source data buffer
    2724             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2725             :  *                      of interest, in bytes.
    2726             :  * @param pDstData the destination buffer.
    2727             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2728             :  *                      interest, in bytes.
    2729             :  * @param nWordCount the total number of pixel words to copy
    2730             :  *
    2731             :  */
    2732             : template <class Tin, class Tout>
    2733        3164 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
    2734             :                                      int nSrcPixelStride,
    2735             :                                      Tout *const CPL_RESTRICT pDstData,
    2736             :                                      int nDstPixelStride, GPtrDiff_t nWordCount)
    2737             : {
    2738        3164 :     decltype(nWordCount) nDstOffset = 0;
    2739             : 
    2740        3164 :     const Tout tOutZero = static_cast<Tout>(0);
    2741             : 
    2742        3164 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2743        3164 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2744             : 
    2745     1112451 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2746             :     {
    2747     1109287 :         const Tin tValue =
    2748     1109287 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    2749     1109287 :         Tout *const pPixelOut =
    2750     1109287 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2751     1109287 :         GDALCopyWord(tValue, *pPixelOut);
    2752             : 
    2753     1109287 :         pPixelOut[1] = tOutZero;
    2754             : 
    2755     1109287 :         nDstOffset += nDstPixelStride;
    2756             :     }
    2757        3164 : }
    2758             : 
    2759             : /************************************************************************/
    2760             : /*                           GDALCopyWordsFromT()                       */
    2761             : /************************************************************************/
    2762             : /**
    2763             :  * Template driver function. Given the input type T, call the appropriate
    2764             :  * GDALCopyWordsT function template for the desired output type. You should
    2765             :  * never call this function directly (call GDALCopyWords instead).
    2766             :  *
    2767             :  * @param pSrcData source data buffer
    2768             :  * @param nSrcPixelStride pixel stride in input buffer, in pixel words
    2769             :  * @param bInComplex input is complex
    2770             :  * @param pDstData destination data buffer
    2771             :  * @param eDstType destination data type
    2772             :  * @param nDstPixelStride pixel stride in output buffer, in pixel words
    2773             :  * @param nWordCount number of pixel words to be copied
    2774             :  */
    2775             : template <class T>
    2776    53312260 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
    2777             :                                int nSrcPixelStride, bool bInComplex,
    2778             :                                void *CPL_RESTRICT pDstData,
    2779             :                                GDALDataType eDstType, int nDstPixelStride,
    2780             :                                GPtrDiff_t nWordCount)
    2781             : {
    2782    53312260 :     switch (eDstType)
    2783             :     {
    2784     4556357 :         case GDT_Byte:
    2785     4556357 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2786             :                            static_cast<unsigned char *>(pDstData),
    2787             :                            nDstPixelStride, nWordCount);
    2788     4556590 :             break;
    2789         457 :         case GDT_Int8:
    2790         457 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2791             :                            static_cast<signed char *>(pDstData),
    2792             :                            nDstPixelStride, nWordCount);
    2793         457 :             break;
    2794      101112 :         case GDT_UInt16:
    2795      101112 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2796             :                            static_cast<unsigned short *>(pDstData),
    2797             :                            nDstPixelStride, nWordCount);
    2798      101114 :             break;
    2799     4125678 :         case GDT_Int16:
    2800     4125678 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2801             :                            static_cast<short *>(pDstData), nDstPixelStride,
    2802             :                            nWordCount);
    2803     4125678 :             break;
    2804        4179 :         case GDT_UInt32:
    2805        4179 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2806             :                            static_cast<unsigned int *>(pDstData),
    2807             :                            nDstPixelStride, nWordCount);
    2808        4179 :             break;
    2809    25277683 :         case GDT_Int32:
    2810    25277683 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2811             :                            static_cast<int *>(pDstData), nDstPixelStride,
    2812             :                            nWordCount);
    2813    25288283 :             break;
    2814         592 :         case GDT_UInt64:
    2815         592 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2816             :                            static_cast<std::uint64_t *>(pDstData),
    2817             :                            nDstPixelStride, nWordCount);
    2818         592 :             break;
    2819        4157 :         case GDT_Int64:
    2820        4157 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2821             :                            static_cast<std::int64_t *>(pDstData),
    2822             :                            nDstPixelStride, nWordCount);
    2823        4157 :             break;
    2824     3869075 :         case GDT_Float32:
    2825     3869075 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2826             :                            static_cast<float *>(pDstData), nDstPixelStride,
    2827             :                            nWordCount);
    2828     3869075 :             break;
    2829    15243095 :         case GDT_Float64:
    2830    15243095 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2831             :                            static_cast<double *>(pDstData), nDstPixelStride,
    2832             :                            nWordCount);
    2833    15243145 :             break;
    2834      122400 :         case GDT_CInt16:
    2835      122400 :             if (bInComplex)
    2836             :             {
    2837      121390 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2838             :                                       static_cast<short *>(pDstData),
    2839             :                                       nDstPixelStride, nWordCount);
    2840             :             }
    2841             :             else  // input is not complex, so we need to promote to a complex
    2842             :                   // buffer
    2843             :             {
    2844        1010 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2845             :                                          static_cast<short *>(pDstData),
    2846             :                                          nDstPixelStride, nWordCount);
    2847             :             }
    2848      122400 :             break;
    2849         779 :         case GDT_CInt32:
    2850         779 :             if (bInComplex)
    2851             :             {
    2852         391 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2853             :                                       static_cast<int *>(pDstData),
    2854             :                                       nDstPixelStride, nWordCount);
    2855             :             }
    2856             :             else  // input is not complex, so we need to promote to a complex
    2857             :                   // buffer
    2858             :             {
    2859         388 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2860             :                                          static_cast<int *>(pDstData),
    2861             :                                          nDstPixelStride, nWordCount);
    2862             :             }
    2863         779 :             break;
    2864        3170 :         case GDT_CFloat32:
    2865        3170 :             if (bInComplex)
    2866             :             {
    2867        2589 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2868             :                                       static_cast<float *>(pDstData),
    2869             :                                       nDstPixelStride, nWordCount);
    2870             :             }
    2871             :             else  // input is not complex, so we need to promote to a complex
    2872             :                   // buffer
    2873             :             {
    2874         581 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2875             :                                          static_cast<float *>(pDstData),
    2876             :                                          nDstPixelStride, nWordCount);
    2877             :             }
    2878        3170 :             break;
    2879        1947 :         case GDT_CFloat64:
    2880        1947 :             if (bInComplex)
    2881             :             {
    2882         762 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2883             :                                       static_cast<double *>(pDstData),
    2884             :                                       nDstPixelStride, nWordCount);
    2885             :             }
    2886             :             else  // input is not complex, so we need to promote to a complex
    2887             :                   // buffer
    2888             :             {
    2889        1185 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2890             :                                          static_cast<double *>(pDstData),
    2891             :                                          nDstPixelStride, nWordCount);
    2892             :             }
    2893        1947 :             break;
    2894           0 :         case GDT_Unknown:
    2895             :         case GDT_TypeCount:
    2896           0 :             CPLAssert(false);
    2897             :     }
    2898    53323015 : }
    2899             : 
    2900             : }  // end anonymous namespace
    2901             : 
    2902             : /************************************************************************/
    2903             : /*                          GDALReplicateWord()                         */
    2904             : /************************************************************************/
    2905             : 
    2906             : template <class T>
    2907      227440 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
    2908             :                                GPtrDiff_t nWordCount)
    2909             : {
    2910      227440 :     const T valSet = *static_cast<const T *>(pDstData);
    2911      227440 :     if (nDstPixelStride == static_cast<int>(sizeof(T)))
    2912             :     {
    2913      199069 :         T *pDstPtr = static_cast<T *>(pDstData) + 1;
    2914     8614812 :         while (nWordCount >= 4)
    2915             :         {
    2916     8415740 :             nWordCount -= 4;
    2917     8415740 :             pDstPtr[0] = valSet;
    2918     8415740 :             pDstPtr[1] = valSet;
    2919     8415740 :             pDstPtr[2] = valSet;
    2920     8415740 :             pDstPtr[3] = valSet;
    2921     8415740 :             pDstPtr += 4;
    2922             :         }
    2923      664645 :         while (nWordCount > 0)
    2924             :         {
    2925      465576 :             --nWordCount;
    2926      465576 :             *pDstPtr = valSet;
    2927      465576 :             pDstPtr++;
    2928             :         }
    2929             :     }
    2930             :     else
    2931             :     {
    2932       28371 :         GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
    2933      954250 :         while (nWordCount > 0)
    2934             :         {
    2935      925879 :             --nWordCount;
    2936      925879 :             *reinterpret_cast<T *>(pabyDstPtr) = valSet;
    2937      925879 :             pabyDstPtr += nDstPixelStride;
    2938             :         }
    2939             :     }
    2940      227440 : }
    2941             : 
    2942      592282 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
    2943             :                               GDALDataType eSrcType,
    2944             :                               void *CPL_RESTRICT pDstData,
    2945             :                               GDALDataType eDstType, int nDstPixelStride,
    2946             :                               GPtrDiff_t nWordCount)
    2947             : {
    2948             :     /* -----------------------------------------------------------------------
    2949             :      */
    2950             :     /* Special case when the source data is always the same value */
    2951             :     /* (for VRTSourcedRasterBand::IRasterIO and
    2952             :      * VRTDerivedRasterBand::IRasterIO*/
    2953             :     /*  for example) */
    2954             :     /* -----------------------------------------------------------------------
    2955             :      */
    2956             :     // Let the general translation case do the necessary conversions
    2957             :     // on the first destination element.
    2958      592282 :     GDALCopyWords(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
    2959             : 
    2960             :     // Now copy the first element to the nWordCount - 1 following destination
    2961             :     // elements.
    2962      592282 :     nWordCount--;
    2963      592282 :     GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
    2964             : 
    2965      592282 :     switch (eDstType)
    2966             :     {
    2967      364758 :         case GDT_Byte:
    2968             :         case GDT_Int8:
    2969             :         {
    2970      364758 :             if (nDstPixelStride == 1)
    2971             :             {
    2972      331006 :                 if (nWordCount > 0)
    2973      331006 :                     memset(pabyDstWord,
    2974      331006 :                            *reinterpret_cast<const GByte *>(pDstData),
    2975             :                            nWordCount);
    2976             :             }
    2977             :             else
    2978             :             {
    2979       33752 :                 GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
    2980     5438530 :                 while (nWordCount > 0)
    2981             :                 {
    2982     5404780 :                     --nWordCount;
    2983     5404780 :                     *pabyDstWord = valSet;
    2984     5404780 :                     pabyDstWord += nDstPixelStride;
    2985             :                 }
    2986             :             }
    2987      364758 :             break;
    2988             :         }
    2989             : 
    2990             : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type)                               \
    2991             :     case enum_type:                                                            \
    2992             :     {                                                                          \
    2993             :         GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount);     \
    2994             :         break;                                                                 \
    2995             :     }
    2996             : 
    2997         333 :             CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
    2998      169647 :             CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
    2999          56 :             CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
    3000         140 :             CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
    3001          21 :             CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
    3002          22 :             CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
    3003       52210 :             CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
    3004        5011 :             CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
    3005             : 
    3006             : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type)                              \
    3007             :     case enum_type:                                                            \
    3008             :     {                                                                          \
    3009             :         c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0];        \
    3010             :         c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1];        \
    3011             :         while (nWordCount > 0)                                                 \
    3012             :         {                                                                      \
    3013             :             --nWordCount;                                                      \
    3014             :             reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1;              \
    3015             :             reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2;              \
    3016             :             pabyDstWord += nDstPixelStride;                                    \
    3017             :         }                                                                      \
    3018             :         break;                                                                 \
    3019             :     }
    3020             : 
    3021         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
    3022         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
    3023         784 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
    3024         784 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
    3025             : 
    3026           0 :         case GDT_Unknown:
    3027             :         case GDT_TypeCount:
    3028           0 :             CPLAssert(false);
    3029             :     }
    3030      592282 : }
    3031             : 
    3032             : /************************************************************************/
    3033             : /*                        GDALUnrolledCopy()                            */
    3034             : /************************************************************************/
    3035             : 
    3036             : template <class T, int srcStride, int dstStride>
    3037     5319771 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
    3038             :                                            const T *CPL_RESTRICT pSrc,
    3039             :                                            GPtrDiff_t nIters)
    3040             : {
    3041     5319771 :     if (nIters >= 16)
    3042             :     {
    3043   138118800 :         for (GPtrDiff_t i = nIters / 16; i != 0; i--)
    3044             :         {
    3045   132928679 :             pDest[0 * dstStride] = pSrc[0 * srcStride];
    3046   132928679 :             pDest[1 * dstStride] = pSrc[1 * srcStride];
    3047   132928679 :             pDest[2 * dstStride] = pSrc[2 * srcStride];
    3048   132928679 :             pDest[3 * dstStride] = pSrc[3 * srcStride];
    3049   132928679 :             pDest[4 * dstStride] = pSrc[4 * srcStride];
    3050   132928679 :             pDest[5 * dstStride] = pSrc[5 * srcStride];
    3051   132928679 :             pDest[6 * dstStride] = pSrc[6 * srcStride];
    3052   132928679 :             pDest[7 * dstStride] = pSrc[7 * srcStride];
    3053   132928679 :             pDest[8 * dstStride] = pSrc[8 * srcStride];
    3054   132928679 :             pDest[9 * dstStride] = pSrc[9 * srcStride];
    3055   132928679 :             pDest[10 * dstStride] = pSrc[10 * srcStride];
    3056   132928679 :             pDest[11 * dstStride] = pSrc[11 * srcStride];
    3057   132928679 :             pDest[12 * dstStride] = pSrc[12 * srcStride];
    3058   132928679 :             pDest[13 * dstStride] = pSrc[13 * srcStride];
    3059   132928679 :             pDest[14 * dstStride] = pSrc[14 * srcStride];
    3060   132928679 :             pDest[15 * dstStride] = pSrc[15 * srcStride];
    3061   132928679 :             pDest += 16 * dstStride;
    3062   132928679 :             pSrc += 16 * srcStride;
    3063             :         }
    3064     5190161 :         nIters = nIters % 16;
    3065             :     }
    3066     7571813 :     for (GPtrDiff_t i = 0; i < nIters; i++)
    3067             :     {
    3068     2252029 :         pDest[i * dstStride] = *pSrc;
    3069     2252029 :         pSrc += srcStride;
    3070             :     }
    3071     5319771 : }
    3072             : 
    3073             : template <class T, int srcStride, int dstStride>
    3074     5314586 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
    3075             :                                     const T *CPL_RESTRICT pSrc,
    3076             :                                     GPtrDiff_t nIters)
    3077             : {
    3078     5314586 :     GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
    3079     5314628 : }
    3080             : 
    3081             : #ifdef HAVE_SSE2
    3082             : 
    3083             : template <>
    3084      303485 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
    3085             :                                    const GByte *CPL_RESTRICT pSrc,
    3086             :                                    GPtrDiff_t nIters)
    3087             : {
    3088      303485 :     decltype(nIters) i = 0;
    3089      303485 :     if (nIters > 16)
    3090             :     {
    3091      145295 :         const __m128i xmm_mask = _mm_set1_epi16(0xff);
    3092             :         // If we were sure that there would always be 1 trailing byte, we could
    3093             :         // check against nIters - 15
    3094     2535420 :         for (; i < nIters - 16; i += 16)
    3095             :         {
    3096             :             __m128i xmm0 =
    3097     2390130 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3098             :             __m128i xmm1 =
    3099     4780250 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3100             :             // Set higher 8bit of each int16 packed word to 0
    3101     2390130 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3102     2390130 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3103             :             // Pack int16 to uint8 and merge back both vector
    3104     2390130 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    3105             : 
    3106             :             // Store result
    3107     2390130 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3108             : 
    3109     2390130 :             pSrc += 2 * 16;
    3110             :         }
    3111             :     }
    3112     3866570 :     for (; i < nIters; i++)
    3113             :     {
    3114     3563080 :         pDest[i] = *pSrc;
    3115     3563080 :         pSrc += 2;
    3116             :     }
    3117      303485 : }
    3118             : 
    3119             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    3120             : 
    3121             : template <>
    3122      184624 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3123             :                                    const GByte *CPL_RESTRICT pSrc,
    3124             :                                    GPtrDiff_t nIters)
    3125             : {
    3126      184624 :     if (nIters > 16 && CPLHaveRuntimeSSSE3())
    3127             :     {
    3128      179427 :         GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    3129             :     }
    3130             :     else
    3131             :     {
    3132        5197 :         GDALUnrolledCopyGeneric<GByte, 3, 1>(pDest, pSrc, nIters);
    3133             :     }
    3134      184624 : }
    3135             : 
    3136             : #endif
    3137             : 
    3138             : template <>
    3139      100208 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
    3140             :                                    const GByte *CPL_RESTRICT pSrc,
    3141             :                                    GPtrDiff_t nIters)
    3142             : {
    3143      100208 :     decltype(nIters) i = 0;
    3144      100208 :     if (nIters > 16)
    3145             :     {
    3146       94915 :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    3147             :         // If we were sure that there would always be 3 trailing bytes, we could
    3148             :         // check against nIters - 15
    3149     8731670 :         for (; i < nIters - 16; i += 16)
    3150             :         {
    3151             :             __m128i xmm0 =
    3152     8636590 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3153             :             __m128i xmm1 =
    3154     8636590 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3155             :             __m128i xmm2 =
    3156     8636590 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    3157             :             __m128i xmm3 =
    3158    17273200 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
    3159             :             // Set higher 24bit of each int32 packed word to 0
    3160     8636590 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3161     8636590 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3162     8636590 :             xmm2 = _mm_and_si128(xmm2, xmm_mask);
    3163     8636590 :             xmm3 = _mm_and_si128(xmm3, xmm_mask);
    3164             :             // Pack int32 to int16
    3165     8636900 :             xmm0 = _mm_packs_epi32(xmm0, xmm1);
    3166     8636670 :             xmm2 = _mm_packs_epi32(xmm2, xmm3);
    3167             :             // Pack int16 to uint8
    3168     8636760 :             xmm0 = _mm_packus_epi16(xmm0, xmm2);
    3169             : 
    3170             :             // Store result
    3171     8636760 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3172             : 
    3173     8636760 :             pSrc += 4 * 16;
    3174             :         }
    3175             :     }
    3176     1100560 :     for (; i < nIters; i++)
    3177             :     {
    3178     1000190 :         pDest[i] = *pSrc;
    3179     1000190 :         pSrc += 4;
    3180             :     }
    3181      100372 : }
    3182             : #endif  // HAVE_SSE2
    3183             : 
    3184             : /************************************************************************/
    3185             : /*                         GDALFastCopy()                               */
    3186             : /************************************************************************/
    3187             : 
    3188             : template <class T>
    3189    39773800 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
    3190             :                                 const T *CPL_RESTRICT pSrc, int nSrcStride,
    3191             :                                 GPtrDiff_t nIters)
    3192             : {
    3193    39773800 :     constexpr int sizeofT = static_cast<int>(sizeof(T));
    3194    39773800 :     if (nIters == 1)
    3195             :     {
    3196    22288380 :         *pDest = *pSrc;
    3197             :     }
    3198    17485412 :     else if (nDestStride == sizeofT)
    3199             :     {
    3200    12233224 :         if (nSrcStride == sizeofT)
    3201             :         {
    3202    11502469 :             memcpy(pDest, pSrc, nIters * sizeof(T));
    3203             :         }
    3204      730703 :         else if (nSrcStride == 2 * sizeofT)
    3205             :         {
    3206      306426 :             GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
    3207             :         }
    3208      424277 :         else if (nSrcStride == 3 * sizeofT)
    3209             :         {
    3210      289484 :             GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
    3211             :         }
    3212      134793 :         else if (nSrcStride == 4 * sizeofT)
    3213             :         {
    3214      129076 :             GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
    3215             :         }
    3216             :         else
    3217             :         {
    3218    12969320 :             while (nIters-- > 0)
    3219             :             {
    3220    12963630 :                 *pDest = *pSrc;
    3221    12963630 :                 pSrc += nSrcStride / sizeofT;
    3222    12963630 :                 pDest++;
    3223             :             }
    3224             :         }
    3225             :     }
    3226     5252258 :     else if (nSrcStride == sizeofT)
    3227             :     {
    3228     5237338 :         if (nDestStride == 2 * sizeofT)
    3229             :         {
    3230      125525 :             GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
    3231             :         }
    3232     5111809 :         else if (nDestStride == 3 * sizeofT)
    3233             :         {
    3234     4409379 :             GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
    3235             :         }
    3236      702427 :         else if (nDestStride == 4 * sizeofT)
    3237             :         {
    3238      643004 :             GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
    3239             :         }
    3240             :         else
    3241             :         {
    3242    12650500 :             while (nIters-- > 0)
    3243             :             {
    3244    12591080 :                 *pDest = *pSrc;
    3245    12591080 :                 pSrc++;
    3246    12591080 :                 pDest += nDestStride / sizeofT;
    3247             :             }
    3248             :         }
    3249             :     }
    3250             :     else
    3251             :     {
    3252     1113808 :         while (nIters-- > 0)
    3253             :         {
    3254     1098888 :             *pDest = *pSrc;
    3255     1098888 :             pSrc += nSrcStride / sizeofT;
    3256     1098888 :             pDest += nDestStride / sizeofT;
    3257             :         }
    3258             :     }
    3259    39773800 : }
    3260             : 
    3261             : /************************************************************************/
    3262             : /*                         GDALFastCopyByte()                           */
    3263             : /************************************************************************/
    3264             : 
    3265      276307 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
    3266             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
    3267             :                              int nDstPixelStride, GPtrDiff_t nWordCount)
    3268             : {
    3269      276307 :     GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
    3270             :                  nWordCount);
    3271      276307 : }
    3272             : 
    3273             : /************************************************************************/
    3274             : /*                           GDALCopyWords()                            */
    3275             : /************************************************************************/
    3276             : 
    3277             : /**
    3278             :  * Copy pixel words from buffer to buffer.
    3279             :  *
    3280             :  * @see GDALCopyWords64()
    3281             :  */
    3282   101097000 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
    3283             :                                GDALDataType eSrcType, int nSrcPixelStride,
    3284             :                                void *CPL_RESTRICT pDstData,
    3285             :                                GDALDataType eDstType, int nDstPixelStride,
    3286             :                                int nWordCount)
    3287             : {
    3288   101097000 :     GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
    3289             :                     nDstPixelStride, nWordCount);
    3290   101096000 : }
    3291             : 
    3292             : /************************************************************************/
    3293             : /*                          GDALCopyWords64()                           */
    3294             : /************************************************************************/
    3295             : 
    3296             : /**
    3297             :  * Copy pixel words from buffer to buffer.
    3298             :  *
    3299             :  * This function is used to copy pixel word values from one memory buffer
    3300             :  * to another, with support for conversion between data types, and differing
    3301             :  * step factors. The data type conversion is done using the following
    3302             :  * rules:
    3303             :  * <ul>
    3304             :  * <li>Values assigned to a lower range integer type are clipped. For
    3305             :  * instance assigning GDT_Int16 values to a GDT_Byte buffer will cause values
    3306             :  * less the 0 to be set to 0, and values larger than 255 to be set to 255.
    3307             :  * </li>
    3308             :  * <li>
    3309             :  * Assignment from floating point to integer rounds to closest integer.
    3310             :  * +Infinity is mapped to the largest integer. -Infinity is mapped to the
    3311             :  * smallest integer. NaN is mapped to 0.
    3312             :  * </li>
    3313             :  * <li>
    3314             :  * Assignment from non-complex to complex will result in the imaginary part
    3315             :  * being set to zero on output.
    3316             :  * </li>
    3317             :  * <li> Assignment from complex to
    3318             :  * non-complex will result in the complex portion being lost and the real
    3319             :  * component being preserved (<i>not magnitude!</i>).
    3320             :  * </li>
    3321             :  * </ul>
    3322             :  *
    3323             :  * No assumptions are made about the source or destination words occurring
    3324             :  * on word boundaries.  It is assumed that all values are in native machine
    3325             :  * byte order.
    3326             :  *
    3327             :  * @param pSrcData Pointer to source data to be converted.
    3328             :  * @param eSrcType the source data type (see GDALDataType enum)
    3329             :  * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
    3330             :  * in bytes
    3331             :  * @param pDstData Pointer to buffer where destination data should go
    3332             :  * @param eDstType the destination data type (see GDALDataType enum)
    3333             :  * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
    3334             :  * words), in bytes
    3335             :  * @param nWordCount number of words to be copied
    3336             :  *
    3337             :  * @note
    3338             :  * When adding a new data type to GDAL, you must do the following to
    3339             :  * support it properly within the GDALCopyWords function:
    3340             :  * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
    3341             :  *    This should invoke the appropriate GDALCopyWordsFromT wrapper.
    3342             :  * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
    3343             :  *    This should call the appropriate GDALCopyWordsT template.
    3344             :  * 3. If appropriate, overload the appropriate CopyWord template in the
    3345             :  *    above namespace. This will ensure that any conversion issues are
    3346             :  *    handled (cases like the float -> int32 case, where the min/max)
    3347             :  *    values are subject to roundoff error.
    3348             :  */
    3349             : 
    3350   107721000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
    3351             :                                  GDALDataType eSrcType, int nSrcPixelStride,
    3352             :                                  void *CPL_RESTRICT pDstData,
    3353             :                                  GDALDataType eDstType, int nDstPixelStride,
    3354             :                                  GPtrDiff_t nWordCount)
    3355             : 
    3356             : {
    3357             :     // On platforms where alignment matters, be careful
    3358   107721000 :     const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
    3359   107710000 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
    3360   107711000 :     if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
    3361             :     {
    3362           2 :         CPLError(CE_Failure, CPLE_NotSupported,
    3363             :                  "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
    3364             :                  "argument");
    3365           2 :         return;
    3366             :     }
    3367   107711000 :     if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
    3368    59699200 :         ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
    3369    59691100 :          (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
    3370    59689800 :          (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
    3371    59687900 :          (nDstPixelStride % nDstDataTypeSize) != 0))
    3372             :     {
    3373         905 :         if (eSrcType == eDstType)
    3374             :         {
    3375       34800 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    3376             :             {
    3377       34000 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    3378             :                        static_cast<const GByte *>(pSrcData) +
    3379       34000 :                            nSrcPixelStride * i,
    3380             :                        nDstDataTypeSize);
    3381             :             }
    3382             :         }
    3383             :         else
    3384             :         {
    3385         210 :             const auto getAlignedPtr = [](GByte *ptr, int align)
    3386             :             {
    3387             :                 return ptr +
    3388         210 :                        ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
    3389         210 :                         align);
    3390             :             };
    3391             : 
    3392             :             // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
    3393             :             // be sure to get correctly aligned pointer.
    3394         105 :             constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
    3395             :             GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
    3396             :             GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
    3397             :             GByte *pabySrcBuffer =
    3398         105 :                 getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
    3399             :             GByte *pabyDstBuffer =
    3400         105 :                 getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
    3401        3360 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    3402             :             {
    3403        3255 :                 memcpy(pabySrcBuffer,
    3404             :                        static_cast<const GByte *>(pSrcData) +
    3405        3255 :                            nSrcPixelStride * i,
    3406             :                        nSrcDataTypeSize);
    3407        3255 :                 GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
    3408             :                                 eDstType, 0, 1);
    3409        3255 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    3410             :                        pabyDstBuffer, nDstDataTypeSize);
    3411             :             }
    3412             :         }
    3413         905 :         return;
    3414             :     }
    3415             : 
    3416             :     // Deal with the case where we're replicating a single word into the
    3417             :     // provided buffer
    3418   107711000 :     if (nSrcPixelStride == 0 && nWordCount > 1)
    3419             :     {
    3420      592282 :         GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
    3421             :                           nDstPixelStride, nWordCount);
    3422      592282 :         return;
    3423             :     }
    3424             : 
    3425   107118000 :     if (eSrcType == eDstType)
    3426             :     {
    3427    53966300 :         if (eSrcType == GDT_Byte || eSrcType == GDT_Int8)
    3428             :         {
    3429    18533000 :             GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
    3430             :                          static_cast<const GByte *>(pSrcData), nSrcPixelStride,
    3431             :                          nWordCount);
    3432    18533500 :             return;
    3433             :         }
    3434             : 
    3435    35433300 :         if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
    3436    20964600 :             (nDstPixelStride % 2) == 0)
    3437             :         {
    3438    20964600 :             GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
    3439             :                          static_cast<const short *>(pSrcData), nSrcPixelStride,
    3440             :                          nWordCount);
    3441    20964100 :             return;
    3442             :         }
    3443             : 
    3444    14468700 :         if (nWordCount == 1)
    3445             :         {
    3446             : #if defined(CSA_BUILD) || defined(__COVERITY__)
    3447             :             // Avoid false positives...
    3448             :             memcpy(pDstData, pSrcData, nSrcDataTypeSize);
    3449             : #else
    3450    14056100 :             if (nSrcDataTypeSize == 2)
    3451           0 :                 memcpy(pDstData, pSrcData, 2);
    3452    14056100 :             else if (nSrcDataTypeSize == 4)
    3453    14014000 :                 memcpy(pDstData, pSrcData, 4);
    3454       42152 :             else if (nSrcDataTypeSize == 8)
    3455       25636 :                 memcpy(pDstData, pSrcData, 8);
    3456             :             else /* if( eSrcType == GDT_CFloat64 ) */
    3457       16516 :                 memcpy(pDstData, pSrcData, 16);
    3458             : #endif
    3459    14056100 :             return;
    3460             :         }
    3461             : 
    3462             :         // Let memcpy() handle the case where we're copying a packed buffer
    3463             :         // of pixels.
    3464      412638 :         if (nSrcPixelStride == nDstPixelStride)
    3465             :         {
    3466      259091 :             if (nSrcPixelStride == nSrcDataTypeSize)
    3467             :             {
    3468      256921 :                 memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
    3469      256921 :                 return;
    3470             :             }
    3471             :         }
    3472             :     }
    3473             : 
    3474             :     // Handle the more general case -- deals with conversion of data types
    3475             :     // directly.
    3476    53307600 :     switch (eSrcType)
    3477             :     {
    3478    14751600 :         case GDT_Byte:
    3479    14751600 :             GDALCopyWordsFromT<unsigned char>(
    3480             :                 static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
    3481             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3482    14760800 :             break;
    3483         942 :         case GDT_Int8:
    3484         942 :             GDALCopyWordsFromT<signed char>(
    3485             :                 static_cast<const signed char *>(pSrcData), nSrcPixelStride,
    3486             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3487         942 :             break;
    3488       52171 :         case GDT_UInt16:
    3489       52171 :             GDALCopyWordsFromT<unsigned short>(
    3490             :                 static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
    3491             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3492       52171 :             break;
    3493     4541780 :         case GDT_Int16:
    3494     4541780 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    3495             :                                       nSrcPixelStride, false, pDstData,
    3496             :                                       eDstType, nDstPixelStride, nWordCount);
    3497     4541800 :             break;
    3498        5813 :         case GDT_UInt32:
    3499        5813 :             GDALCopyWordsFromT<unsigned int>(
    3500             :                 static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
    3501             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3502        5813 :             break;
    3503    12254600 :         case GDT_Int32:
    3504    12254600 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    3505             :                                     nSrcPixelStride, false, pDstData, eDstType,
    3506             :                                     nDstPixelStride, nWordCount);
    3507    12254600 :             break;
    3508         496 :         case GDT_UInt64:
    3509         496 :             GDALCopyWordsFromT<std::uint64_t>(
    3510             :                 static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
    3511             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3512         496 :             break;
    3513        7126 :         case GDT_Int64:
    3514        7126 :             GDALCopyWordsFromT<std::int64_t>(
    3515             :                 static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
    3516             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3517        7126 :             break;
    3518      278504 :         case GDT_Float32:
    3519      278504 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    3520             :                                       nSrcPixelStride, false, pDstData,
    3521             :                                       eDstType, nDstPixelStride, nWordCount);
    3522      278499 :             break;
    3523    20677600 :         case GDT_Float64:
    3524    20677600 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    3525             :                                        nSrcPixelStride, false, pDstData,
    3526             :                                        eDstType, nDstPixelStride, nWordCount);
    3527    20677700 :             break;
    3528      566927 :         case GDT_CInt16:
    3529      566927 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    3530             :                                       nSrcPixelStride, true, pDstData, eDstType,
    3531             :                                       nDstPixelStride, nWordCount);
    3532      566927 :             break;
    3533         383 :         case GDT_CInt32:
    3534         383 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    3535             :                                     nSrcPixelStride, true, pDstData, eDstType,
    3536             :                                     nDstPixelStride, nWordCount);
    3537         383 :             break;
    3538        1323 :         case GDT_CFloat32:
    3539        1323 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    3540             :                                       nSrcPixelStride, true, pDstData, eDstType,
    3541             :                                       nDstPixelStride, nWordCount);
    3542        1323 :             break;
    3543      172460 :         case GDT_CFloat64:
    3544      172460 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    3545             :                                        nSrcPixelStride, true, pDstData,
    3546             :                                        eDstType, nDstPixelStride, nWordCount);
    3547      172460 :             break;
    3548           0 :         case GDT_Unknown:
    3549             :         case GDT_TypeCount:
    3550           0 :             CPLAssert(false);
    3551             :     }
    3552             : }
    3553             : 
    3554             : /************************************************************************/
    3555             : /*                            GDALCopyBits()                            */
    3556             : /************************************************************************/
    3557             : 
    3558             : /**
    3559             :  * Bitwise word copying.
    3560             :  *
    3561             :  * A function for moving sets of partial bytes around.  Loosely
    3562             :  * speaking this is a bitwise analog to GDALCopyWords().
    3563             :  *
    3564             :  * It copies nStepCount "words" where each word is nBitCount bits long.
    3565             :  * The nSrcStep and nDstStep are the number of bits from the start of one
    3566             :  * word to the next (same as nBitCount if they are packed).  The nSrcOffset
    3567             :  * and nDstOffset are the offset into the source and destination buffers
    3568             :  * to start at, also measured in bits.
    3569             :  *
    3570             :  * All bit offsets are assumed to start from the high order bit in a byte
    3571             :  * (i.e. most significant bit first).  Currently this function is not very
    3572             :  * optimized, but it may be improved for some common cases in the future
    3573             :  * as needed.
    3574             :  *
    3575             :  * @param pabySrcData the source data buffer.
    3576             :  * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
    3577             :  * first word to copy.
    3578             :  * @param nSrcStep the offset in bits from the start one source word to the
    3579             :  * start of the next.
    3580             :  * @param pabyDstData the destination data buffer.
    3581             :  * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
    3582             :  * first word to copy over.
    3583             :  * @param nDstStep the offset in bits from the start one word to the
    3584             :  * start of the next.
    3585             :  * @param nBitCount the number of bits in a word to be copied.
    3586             :  * @param nStepCount the number of words to copy.
    3587             :  */
    3588             : 
    3589           0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
    3590             :                   GByte *pabyDstData, int nDstOffset, int nDstStep,
    3591             :                   int nBitCount, int nStepCount)
    3592             : 
    3593             : {
    3594           0 :     VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
    3595             : 
    3596           0 :     for (int iStep = 0; iStep < nStepCount; iStep++)
    3597             :     {
    3598           0 :         for (int iBit = 0; iBit < nBitCount; iBit++)
    3599             :         {
    3600           0 :             if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
    3601           0 :                 pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
    3602             :             else
    3603           0 :                 pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
    3604             : 
    3605           0 :             nSrcOffset++;
    3606           0 :             nDstOffset++;
    3607             :         }
    3608             : 
    3609           0 :         nSrcOffset += (nSrcStep - nBitCount);
    3610           0 :         nDstOffset += (nDstStep - nBitCount);
    3611             :     }
    3612             : }
    3613             : 
    3614             : /************************************************************************/
    3615             : /*                    GDALGetBestOverviewLevel()                        */
    3616             : /*                                                                      */
    3617             : /* Returns the best overview level to satisfy the query or -1 if none   */
    3618             : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when        */
    3619             : /* returning a valid overview level                                     */
    3620             : /************************************************************************/
    3621             : 
    3622           0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
    3623             :                                  int &nXSize, int &nYSize, int nBufXSize,
    3624             :                                  int nBufYSize)
    3625             : {
    3626           0 :     return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
    3627           0 :                                          nBufXSize, nBufYSize, nullptr);
    3628             : }
    3629             : 
    3630      322801 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
    3631             :                                   int &nYOff, int &nXSize, int &nYSize,
    3632             :                                   int nBufXSize, int nBufYSize,
    3633             :                                   GDALRasterIOExtraArg *psExtraArg)
    3634             : {
    3635             :     /* -------------------------------------------------------------------- */
    3636             :     /*      Compute the desired downsampling factor.  It is                 */
    3637             :     /*      based on the least reduced axis, and represents the number      */
    3638             :     /*      of source pixels to one destination pixel.                      */
    3639             :     /* -------------------------------------------------------------------- */
    3640      322801 :     const double dfDesiredDownsamplingFactor =
    3641      322801 :         ((nXSize / static_cast<double>(nBufXSize)) <
    3642      160465 :              (nYSize / static_cast<double>(nBufYSize)) ||
    3643             :          nBufYSize == 1)
    3644      354177 :             ? nXSize / static_cast<double>(nBufXSize)
    3645      129089 :             : nYSize / static_cast<double>(nBufYSize);
    3646             : 
    3647             :     /* -------------------------------------------------------------------- */
    3648             :     /*      Find the overview level that largest downsampling factor (most  */
    3649             :     /*      downsampled) that is still less than (or only a little more)    */
    3650             :     /*      downsampled than the request.                                   */
    3651             :     /* -------------------------------------------------------------------- */
    3652      322801 :     const int nOverviewCount = poBand->GetOverviewCount();
    3653      322801 :     GDALRasterBand *poBestOverview = nullptr;
    3654      322801 :     double dfBestDownsamplingFactor = 0;
    3655      322801 :     int nBestOverviewLevel = -1;
    3656             : 
    3657             :     const char *pszOversampligThreshold =
    3658      322801 :         CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
    3659             : 
    3660             :     // Note: keep this logic for overview selection in sync between
    3661             :     // gdalwarp_lib.cpp and rasterio.cpp
    3662             :     // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
    3663             :     const double dfOversamplingThreshold =
    3664      645593 :         pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
    3665      322792 :         : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
    3666      645584 :             ? 1.0
    3667      322801 :             : 1.2;
    3668      325491 :     for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    3669             :     {
    3670        5526 :         GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
    3671       11052 :         if (poOverview == nullptr ||
    3672       11051 :             poOverview->GetXSize() > poBand->GetXSize() ||
    3673        5525 :             poOverview->GetYSize() > poBand->GetYSize())
    3674             :         {
    3675           1 :             continue;
    3676             :         }
    3677             : 
    3678             :         // Compute downsampling factor of this overview
    3679             :         const double dfDownsamplingFactor = std::min(
    3680        5525 :             poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
    3681       11050 :             poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
    3682             : 
    3683             :         // Is it nearly the requested factor and better (lower) than
    3684             :         // the current best factor?
    3685             :         // Use an epsilon because of numerical instability.
    3686        5525 :         constexpr double EPSILON = 1e-1;
    3687        5633 :         if (dfDownsamplingFactor >=
    3688        5525 :                 dfDesiredDownsamplingFactor * dfOversamplingThreshold +
    3689        5417 :                     EPSILON ||
    3690             :             dfDownsamplingFactor <= dfBestDownsamplingFactor)
    3691             :         {
    3692         108 :             continue;
    3693             :         }
    3694             : 
    3695             :         // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
    3696        5417 :         const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
    3697             : 
    3698        5417 :         if (pszResampling != nullptr &&
    3699          69 :             STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
    3700          16 :             continue;
    3701             : 
    3702             :         // OK, this is our new best overview.
    3703        5401 :         poBestOverview = poOverview;
    3704        5401 :         nBestOverviewLevel = iOverview;
    3705        5401 :         dfBestDownsamplingFactor = dfDownsamplingFactor;
    3706             : 
    3707        5401 :         if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
    3708             :             EPSILON)
    3709             :         {
    3710        2836 :             break;
    3711             :         }
    3712             :     }
    3713             : 
    3714             :     /* -------------------------------------------------------------------- */
    3715             :     /*      If we didn't find an overview that helps us, just return        */
    3716             :     /*      indicating failure and the full resolution image will be used.  */
    3717             :     /* -------------------------------------------------------------------- */
    3718      322801 :     if (nBestOverviewLevel < 0)
    3719      319898 :         return -1;
    3720             : 
    3721             :     /* -------------------------------------------------------------------- */
    3722             :     /*      Recompute the source window in terms of the selected            */
    3723             :     /*      overview.                                                       */
    3724             :     /* -------------------------------------------------------------------- */
    3725             :     const double dfXFactor =
    3726        2903 :         poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
    3727             :     const double dfYFactor =
    3728        2903 :         poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
    3729        2903 :     CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
    3730             :              poBestOverview->GetYSize());
    3731             : 
    3732        8709 :     const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
    3733        2903 :                                 static_cast<int>(nXOff / dfXFactor + 0.5));
    3734        8709 :     const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
    3735        2903 :                                 static_cast<int>(nYOff / dfYFactor + 0.5));
    3736        2903 :     int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
    3737        2903 :     int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
    3738        2903 :     if (nOXOff + nOXSize > poBestOverview->GetXSize())
    3739           0 :         nOXSize = poBestOverview->GetXSize() - nOXOff;
    3740        2903 :     if (nOYOff + nOYSize > poBestOverview->GetYSize())
    3741           2 :         nOYSize = poBestOverview->GetYSize() - nOYOff;
    3742             : 
    3743        2903 :     if (psExtraArg)
    3744             :     {
    3745        2903 :         if (psExtraArg->bFloatingPointWindowValidity)
    3746             :         {
    3747          45 :             psExtraArg->dfXOff /= dfXFactor;
    3748          45 :             psExtraArg->dfXSize /= dfXFactor;
    3749          45 :             psExtraArg->dfYOff /= dfYFactor;
    3750          45 :             psExtraArg->dfYSize /= dfYFactor;
    3751             :         }
    3752        2858 :         else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
    3753             :         {
    3754          16 :             psExtraArg->bFloatingPointWindowValidity = true;
    3755          16 :             psExtraArg->dfXOff = nXOff / dfXFactor;
    3756          16 :             psExtraArg->dfXSize = nXSize / dfXFactor;
    3757          16 :             psExtraArg->dfYOff = nYOff / dfYFactor;
    3758          16 :             psExtraArg->dfYSize = nYSize / dfYFactor;
    3759             :         }
    3760             :     }
    3761             : 
    3762        2903 :     nXOff = nOXOff;
    3763        2903 :     nYOff = nOYOff;
    3764        2903 :     nXSize = nOXSize;
    3765        2903 :     nYSize = nOYSize;
    3766             : 
    3767        2903 :     return nBestOverviewLevel;
    3768             : }
    3769             : 
    3770             : /************************************************************************/
    3771             : /*                          OverviewRasterIO()                          */
    3772             : /*                                                                      */
    3773             : /*      Special work function to utilize available overviews to         */
    3774             : /*      more efficiently satisfy downsampled requests.  It will         */
    3775             : /*      return CE_Failure if there are no appropriate overviews         */
    3776             : /*      available but it doesn't emit any error messages.               */
    3777             : /************************************************************************/
    3778             : 
    3779             : //! @cond Doxygen_Suppress
    3780           2 : CPLErr GDALRasterBand::OverviewRasterIO(
    3781             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    3782             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    3783             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    3784             : 
    3785             : {
    3786             :     GDALRasterIOExtraArg sExtraArg;
    3787           2 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    3788             : 
    3789           2 :     const int nOverview = GDALBandGetBestOverviewLevel2(
    3790             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
    3791           2 :     if (nOverview < 0)
    3792           1 :         return CE_Failure;
    3793             : 
    3794             :     /* -------------------------------------------------------------------- */
    3795             :     /*      Recast the call in terms of the new raster layer.               */
    3796             :     /* -------------------------------------------------------------------- */
    3797           1 :     GDALRasterBand *poOverviewBand = GetOverview(nOverview);
    3798           1 :     if (poOverviewBand == nullptr)
    3799           0 :         return CE_Failure;
    3800             : 
    3801           1 :     return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    3802             :                                     pData, nBufXSize, nBufYSize, eBufType,
    3803           1 :                                     nPixelSpace, nLineSpace, &sExtraArg);
    3804             : }
    3805             : 
    3806             : /************************************************************************/
    3807             : /*                      TryOverviewRasterIO()                           */
    3808             : /************************************************************************/
    3809             : 
    3810      161938 : CPLErr GDALRasterBand::TryOverviewRasterIO(
    3811             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    3812             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    3813             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
    3814             :     int *pbTried)
    3815             : {
    3816      161938 :     int nXOffMod = nXOff;
    3817      161938 :     int nYOffMod = nYOff;
    3818      161938 :     int nXSizeMod = nXSize;
    3819      161938 :     int nYSizeMod = nYSize;
    3820             :     GDALRasterIOExtraArg sExtraArg;
    3821             : 
    3822      161938 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    3823             : 
    3824      161938 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    3825             :         this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
    3826             :         &sExtraArg);
    3827             : 
    3828      161938 :     if (iOvrLevel >= 0)
    3829             :     {
    3830          49 :         GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
    3831          49 :         if (poOverviewBand)
    3832             :         {
    3833          49 :             *pbTried = TRUE;
    3834          49 :             return poOverviewBand->RasterIO(
    3835             :                 eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
    3836             :                 nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    3837          49 :                 &sExtraArg);
    3838             :         }
    3839             :     }
    3840             : 
    3841      161889 :     *pbTried = FALSE;
    3842      161889 :     return CE_None;
    3843             : }
    3844             : 
    3845             : /************************************************************************/
    3846             : /*                      TryOverviewRasterIO()                           */
    3847             : /************************************************************************/
    3848             : 
    3849      158022 : CPLErr GDALDataset::TryOverviewRasterIO(
    3850             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    3851             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    3852             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    3853             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
    3854             :     int *pbTried)
    3855             : {
    3856      158022 :     int nXOffMod = nXOff;
    3857      158022 :     int nYOffMod = nYOff;
    3858      158022 :     int nXSizeMod = nXSize;
    3859      158022 :     int nYSizeMod = nYSize;
    3860             :     GDALRasterIOExtraArg sExtraArg;
    3861      158022 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    3862             : 
    3863      316044 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    3864      158022 :         papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
    3865             :         nBufYSize, &sExtraArg);
    3866             : 
    3867      158058 :     if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
    3868          36 :         papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
    3869             :     {
    3870          36 :         *pbTried = TRUE;
    3871          36 :         return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
    3872             :             eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
    3873             :             nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
    3874          36 :             nLineSpace, nBandSpace, &sExtraArg);
    3875             :     }
    3876             :     else
    3877             :     {
    3878      157986 :         *pbTried = FALSE;
    3879      157986 :         return CE_None;
    3880             :     }
    3881             : }
    3882             : 
    3883             : /************************************************************************/
    3884             : /*                        GetBestOverviewLevel()                        */
    3885             : /*                                                                      */
    3886             : /* Returns the best overview level to satisfy the query or -1 if none   */
    3887             : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid     */
    3888             : /* overview level                                                       */
    3889             : /************************************************************************/
    3890             : 
    3891           4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
    3892             :                                            int &nYOff, int &nXSize, int &nYSize,
    3893             :                                            int nBufXSize, int nBufYSize,
    3894             :                                            int nBandCount,
    3895             :                                            const int *panBandMap,
    3896             :                                            GDALRasterIOExtraArg *psExtraArg)
    3897             : {
    3898           4 :     int nOverviewCount = 0;
    3899           4 :     GDALRasterBand *poFirstBand = nullptr;
    3900             : 
    3901             :     /* -------------------------------------------------------------------- */
    3902             :     /* Check that all bands have the same number of overviews and           */
    3903             :     /* that they have all the same size and block dimensions                */
    3904             :     /* -------------------------------------------------------------------- */
    3905          12 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    3906             :     {
    3907           8 :         GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
    3908           8 :         if (poBand == nullptr)
    3909           0 :             return -1;
    3910           8 :         if (iBand == 0)
    3911             :         {
    3912           4 :             poFirstBand = poBand;
    3913           4 :             nOverviewCount = poBand->GetOverviewCount();
    3914             :         }
    3915           4 :         else if (nOverviewCount != poBand->GetOverviewCount())
    3916             :         {
    3917           0 :             CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    3918             :                              "mismatched overview count, use std method.");
    3919           0 :             return -1;
    3920             :         }
    3921             :         else
    3922             :         {
    3923           4 :             for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    3924             :             {
    3925           0 :                 GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
    3926             :                 GDALRasterBand *poOvrFirstBand =
    3927           0 :                     poFirstBand->GetOverview(iOverview);
    3928           0 :                 if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
    3929           0 :                     continue;
    3930             : 
    3931           0 :                 if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
    3932           0 :                     poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
    3933             :                 {
    3934           0 :                     CPLDebug("GDAL",
    3935             :                              "GDALDataset::GetBestOverviewLevel() ... "
    3936             :                              "mismatched overview sizes, use std method.");
    3937           0 :                     return -1;
    3938             :                 }
    3939           0 :                 int nBlockXSizeFirst = 0;
    3940           0 :                 int nBlockYSizeFirst = 0;
    3941           0 :                 poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
    3942             :                                              &nBlockYSizeFirst);
    3943             : 
    3944           0 :                 int nBlockXSizeCurrent = 0;
    3945           0 :                 int nBlockYSizeCurrent = 0;
    3946           0 :                 poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
    3947             :                                         &nBlockYSizeCurrent);
    3948             : 
    3949           0 :                 if (nBlockXSizeFirst != nBlockXSizeCurrent ||
    3950           0 :                     nBlockYSizeFirst != nBlockYSizeCurrent)
    3951             :                 {
    3952           0 :                     CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    3953             :                                      "mismatched block sizes, use std method.");
    3954           0 :                     return -1;
    3955             :                 }
    3956             :             }
    3957             :         }
    3958             :     }
    3959           4 :     if (poFirstBand == nullptr)
    3960           0 :         return -1;
    3961             : 
    3962           4 :     return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
    3963             :                                          nYSize, nBufXSize, nBufYSize,
    3964           4 :                                          psExtraArg);
    3965             : }
    3966             : 
    3967             : /************************************************************************/
    3968             : /*                         BlockBasedRasterIO()                         */
    3969             : /*                                                                      */
    3970             : /*      This convenience function implements a dataset level            */
    3971             : /*      RasterIO() interface based on calling down to fetch blocks,     */
    3972             : /*      much like the GDALRasterBand::IRasterIO(), but it handles       */
    3973             : /*      all bands at once, so that a format driver that handles a       */
    3974             : /*      request for different bands of the same block efficiently       */
    3975             : /*      (i.e. without re-reading interleaved data) will efficiently.    */
    3976             : /*                                                                      */
    3977             : /*      This method is intended to be called by an overridden           */
    3978             : /*      IRasterIO() method in the driver specific GDALDataset           */
    3979             : /*      derived class.                                                  */
    3980             : /*                                                                      */
    3981             : /*      Default internal implementation of RasterIO() ... utilizes      */
    3982             : /*      the Block access methods to satisfy the request.  This would    */
    3983             : /*      normally only be overridden by formats with overviews.          */
    3984             : /*                                                                      */
    3985             : /*      To keep things relatively simple, this method does not          */
    3986             : /*      currently take advantage of some special cases addressed in     */
    3987             : /*      GDALRasterBand::IRasterIO(), so it is likely best to only       */
    3988             : /*      call it when you know it will help.  That is in cases where     */
    3989             : /*      data is at 1:1 to the buffer, and you know the driver is        */
    3990             : /*      implementing interleaved IO efficiently on a block by block     */
    3991             : /*      basis. Overviews will be used when possible.                    */
    3992             : /************************************************************************/
    3993             : 
    3994       63528 : CPLErr GDALDataset::BlockBasedRasterIO(
    3995             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    3996             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    3997             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    3998             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    3999             : 
    4000             : {
    4001       63528 :     CPLAssert(nullptr != pData);
    4002             : 
    4003       63528 :     GByte **papabySrcBlock = nullptr;
    4004       63528 :     GDALRasterBlock *poBlock = nullptr;
    4005       63528 :     GDALRasterBlock **papoBlocks = nullptr;
    4006       63528 :     int nLBlockX = -1;
    4007       63528 :     int nLBlockY = -1;
    4008             :     int iBufYOff;
    4009             :     int iBufXOff;
    4010       63528 :     int nBlockXSize = 1;
    4011       63528 :     int nBlockYSize = 1;
    4012       63528 :     CPLErr eErr = CE_None;
    4013       63528 :     GDALDataType eDataType = GDT_Byte;
    4014             : 
    4015       63528 :     const bool bUseIntegerRequestCoords =
    4016       63961 :         (!psExtraArg->bFloatingPointWindowValidity ||
    4017         433 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
    4018         431 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
    4019             : 
    4020             :     /* -------------------------------------------------------------------- */
    4021             :     /*      Ensure that all bands share a common block size and data type.  */
    4022             :     /* -------------------------------------------------------------------- */
    4023      300722 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4024             :     {
    4025      237194 :         GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4026             : 
    4027      237192 :         if (iBand == 0)
    4028             :         {
    4029       63528 :             poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4030       63528 :             eDataType = poBand->GetRasterDataType();
    4031             :         }
    4032             :         else
    4033             :         {
    4034      173664 :             int nThisBlockXSize = 0;
    4035      173664 :             int nThisBlockYSize = 0;
    4036      173664 :             poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
    4037      173667 :             if (nThisBlockXSize != nBlockXSize ||
    4038      173667 :                 nThisBlockYSize != nBlockYSize)
    4039             :             {
    4040           1 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4041             :                                  "mismatched block sizes, use std method.");
    4042           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4043             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4044             :                                          nBandCount, panBandMap, nPixelSpace,
    4045           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4046             :             }
    4047             : 
    4048      173666 :             if (eDataType != poBand->GetRasterDataType() &&
    4049           0 :                 (nXSize != nBufXSize || nYSize != nBufYSize))
    4050             :             {
    4051           0 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4052             :                                  "mismatched band data types, use std method.");
    4053           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4054             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4055             :                                          nBandCount, panBandMap, nPixelSpace,
    4056           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4057             :             }
    4058             :         }
    4059             :     }
    4060             : 
    4061             :     /* ==================================================================== */
    4062             :     /*      In this special case at full resolution we step through in      */
    4063             :     /*      blocks, turning the request over to the per-band                */
    4064             :     /*      IRasterIO(), but ensuring that all bands of one block are       */
    4065             :     /*      called before proceeding to the next.                           */
    4066             :     /* ==================================================================== */
    4067             : 
    4068       63528 :     if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
    4069             :     {
    4070             :         GDALRasterIOExtraArg sDummyExtraArg;
    4071       63524 :         INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
    4072             : 
    4073       63524 :         int nChunkYSize = 0;
    4074       63524 :         int nChunkXSize = 0;
    4075             : 
    4076      219117 :         for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
    4077             :         {
    4078      156620 :             const int nChunkYOff = iBufYOff + nYOff;
    4079      156620 :             nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
    4080      156620 :             if (nChunkYOff + nChunkYSize > nYOff + nYSize)
    4081       58858 :                 nChunkYSize = (nYOff + nYSize) - nChunkYOff;
    4082             : 
    4083      837083 :             for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
    4084             :             {
    4085      681480 :                 const int nChunkXOff = iBufXOff + nXOff;
    4086      681480 :                 nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
    4087      681480 :                 if (nChunkXOff + nChunkXSize > nXOff + nXSize)
    4088       74871 :                     nChunkXSize = (nXOff + nXSize) - nChunkXOff;
    4089             : 
    4090      681480 :                 GByte *pabyChunkData =
    4091      681480 :                     static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
    4092      681480 :                     static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
    4093             : 
    4094     3311550 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4095             :                 {
    4096     2631090 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4097             : 
    4098     5262150 :                     eErr = poBand->IRasterIO(
    4099             :                         eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
    4100             :                         nChunkYSize,
    4101     2631060 :                         pabyChunkData +
    4102     2631060 :                             static_cast<GPtrDiff_t>(iBand) * nBandSpace,
    4103             :                         nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
    4104     2631060 :                         nLineSpace, &sDummyExtraArg);
    4105     2631080 :                     if (eErr != CE_None)
    4106        1013 :                         return eErr;
    4107             :                 }
    4108             :             }
    4109             : 
    4110      175839 :             if (psExtraArg->pfnProgress != nullptr &&
    4111       20236 :                 !psExtraArg->pfnProgress(
    4112      175839 :                     1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
    4113             :                         nBufYSize,
    4114             :                     "", psExtraArg->pProgressData))
    4115             :             {
    4116           8 :                 return CE_Failure;
    4117             :             }
    4118             :         }
    4119             : 
    4120       62497 :         return CE_None;
    4121             :     }
    4122             : 
    4123             :     /* Below code is not compatible with that case. It would need a complete */
    4124             :     /* separate code like done in GDALRasterBand::IRasterIO. */
    4125           4 :     if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
    4126             :     {
    4127           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4128             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4129             :                                  panBandMap, nPixelSpace, nLineSpace,
    4130           0 :                                  nBandSpace, psExtraArg);
    4131             :     }
    4132             : 
    4133             :     /* We could have a smarter implementation, but that will do for now */
    4134           4 :     if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    4135           0 :         (nBufXSize != nXSize || nBufYSize != nYSize))
    4136             :     {
    4137           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4138             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4139             :                                  panBandMap, nPixelSpace, nLineSpace,
    4140           0 :                                  nBandSpace, psExtraArg);
    4141             :     }
    4142             : 
    4143             :     /* ==================================================================== */
    4144             :     /*      Loop reading required source blocks to satisfy output           */
    4145             :     /*      request.  This is the most general implementation.              */
    4146             :     /* ==================================================================== */
    4147             : 
    4148           4 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
    4149             : 
    4150             :     papabySrcBlock =
    4151           4 :         static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
    4152             :     papoBlocks =
    4153           4 :         static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
    4154             : 
    4155             :     /* -------------------------------------------------------------------- */
    4156             :     /*      Select an overview level if appropriate.                        */
    4157             :     /* -------------------------------------------------------------------- */
    4158             : 
    4159             :     GDALRasterIOExtraArg sExtraArg;
    4160           4 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4161           4 :     const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
    4162             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
    4163             :         panBandMap, &sExtraArg);
    4164           4 :     if (nOverviewLevel >= 0)
    4165             :     {
    4166           2 :         GetRasterBand(panBandMap[0])
    4167           2 :             ->GetOverview(nOverviewLevel)
    4168           2 :             ->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4169             :     }
    4170             : 
    4171           4 :     double dfXOff = nXOff;
    4172           4 :     double dfYOff = nYOff;
    4173           4 :     double dfXSize = nXSize;
    4174           4 :     double dfYSize = nYSize;
    4175           4 :     if (sExtraArg.bFloatingPointWindowValidity)
    4176             :     {
    4177           2 :         dfXOff = sExtraArg.dfXOff;
    4178           2 :         dfYOff = sExtraArg.dfYOff;
    4179           2 :         dfXSize = sExtraArg.dfXSize;
    4180           2 :         dfYSize = sExtraArg.dfYSize;
    4181             :     }
    4182             : 
    4183             :     /* -------------------------------------------------------------------- */
    4184             :     /*      Compute stepping increment.                                     */
    4185             :     /* -------------------------------------------------------------------- */
    4186           4 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
    4187           4 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
    4188             : 
    4189           4 :     constexpr double EPS = 1e-10;
    4190             :     /* -------------------------------------------------------------------- */
    4191             :     /*      Loop over buffer computing source locations.                    */
    4192             :     /* -------------------------------------------------------------------- */
    4193          36 :     for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
    4194             :     {
    4195             :         GPtrDiff_t iSrcOffset;
    4196             : 
    4197             :         // Add small epsilon to avoid some numeric precision issues.
    4198          32 :         const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
    4199          32 :         const int iSrcY = static_cast<int>(std::min(
    4200          32 :             std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
    4201             : 
    4202          32 :         GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
    4203             :                                 static_cast<GPtrDiff_t>(nLineSpace);
    4204             : 
    4205         302 :         for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
    4206             :         {
    4207         270 :             const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
    4208         270 :             const int iSrcX = static_cast<int>(std::min(
    4209         270 :                 std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
    4210             : 
    4211             :             // FIXME: this code likely doesn't work if the dirty block gets
    4212             :             // flushed to disk before being completely written. In the meantime,
    4213             :             // bJustInitialize should probably be set to FALSE even if it is not
    4214             :             // ideal performance wise, and for lossy compression
    4215             : 
    4216             :             /* --------------------------------------------------------------------
    4217             :              */
    4218             :             /*      Ensure we have the appropriate block loaded. */
    4219             :             /* --------------------------------------------------------------------
    4220             :              */
    4221         270 :             if (iSrcX < nLBlockX * nBlockXSize ||
    4222         270 :                 iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
    4223         266 :                 iSrcY < nLBlockY * nBlockYSize ||
    4224         266 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
    4225             :             {
    4226           4 :                 nLBlockX = iSrcX / nBlockXSize;
    4227           4 :                 nLBlockY = iSrcY / nBlockYSize;
    4228             : 
    4229           4 :                 const bool bJustInitialize =
    4230           0 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
    4231           0 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
    4232           4 :                     nXOff <= nLBlockX * nBlockXSize &&
    4233           0 :                     nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
    4234             :                 /*bool bMemZeroBuffer = FALSE;
    4235             :                 if( eRWFlag == GF_Write && !bJustInitialize &&
    4236             :                     nXOff <= nLBlockX * nBlockXSize &&
    4237             :                     nYOff <= nLBlockY * nBlockYSize &&
    4238             :                     (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
    4239             :                      (nXOff + nXSize == GetRasterXSize() &&
    4240             :                      (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
    4241             :                     (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
    4242             :                      (nYOff + nYSize == GetRasterYSize() &&
    4243             :                      (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
    4244             :                 {
    4245             :                     bJustInitialize = TRUE;
    4246             :                     bMemZeroBuffer = TRUE;
    4247             :                 }*/
    4248          12 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4249             :                 {
    4250           8 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4251           8 :                     if (nOverviewLevel >= 0)
    4252           2 :                         poBand = poBand->GetOverview(nOverviewLevel);
    4253          16 :                     poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
    4254           8 :                                                         bJustInitialize);
    4255           8 :                     if (poBlock == nullptr)
    4256             :                     {
    4257           0 :                         eErr = CE_Failure;
    4258           0 :                         goto CleanupAndReturn;
    4259             :                     }
    4260             : 
    4261           8 :                     if (eRWFlag == GF_Write)
    4262           0 :                         poBlock->MarkDirty();
    4263             : 
    4264           8 :                     if (papoBlocks[iBand] != nullptr)
    4265           0 :                         papoBlocks[iBand]->DropLock();
    4266             : 
    4267           8 :                     papoBlocks[iBand] = poBlock;
    4268             : 
    4269           8 :                     papabySrcBlock[iBand] =
    4270           8 :                         static_cast<GByte *>(poBlock->GetDataRef());
    4271             :                     /*if( bMemZeroBuffer )
    4272             :                     {
    4273             :                         memset(papabySrcBlock[iBand], 0,
    4274             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
    4275             :                     * nBlockYSize);
    4276             :                     }*/
    4277             :                 }
    4278             :             }
    4279             : 
    4280             :             /* --------------------------------------------------------------------
    4281             :              */
    4282             :             /*      Copy over this pixel of data. */
    4283             :             /* --------------------------------------------------------------------
    4284             :              */
    4285         270 :             iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
    4286         270 :                           static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
    4287         270 :                           (static_cast<GPtrDiff_t>(iSrcY) -
    4288         270 :                            static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
    4289         270 :                               nBlockXSize) *
    4290         270 :                          nBandDataSize;
    4291             : 
    4292         980 :             for (int iBand = 0; iBand < nBandCount; iBand++)
    4293             :             {
    4294         710 :                 GByte *pabySrcBlock = papabySrcBlock[iBand];
    4295         710 :                 GPtrDiff_t iBandBufOffset =
    4296         710 :                     iBufOffset + static_cast<GPtrDiff_t>(iBand) *
    4297             :                                      static_cast<GPtrDiff_t>(nBandSpace);
    4298             : 
    4299         710 :                 if (eDataType == eBufType)
    4300             :                 {
    4301         710 :                     if (eRWFlag == GF_Read)
    4302         710 :                         memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
    4303         710 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
    4304             :                     else
    4305           0 :                         memcpy(pabySrcBlock + iSrcOffset,
    4306             :                                static_cast<const GByte *>(pData) +
    4307           0 :                                    iBandBufOffset,
    4308             :                                nBandDataSize);
    4309             :                 }
    4310             :                 else
    4311             :                 {
    4312             :                     /* type to type conversion ... ouch, this is expensive way
    4313             :                        of handling single words */
    4314             : 
    4315           0 :                     if (eRWFlag == GF_Read)
    4316           0 :                         GDALCopyWords(pabySrcBlock + iSrcOffset, eDataType, 0,
    4317             :                                       static_cast<GByte *>(pData) +
    4318           0 :                                           iBandBufOffset,
    4319             :                                       eBufType, 0, 1);
    4320             :                     else
    4321           0 :                         GDALCopyWords(static_cast<const GByte *>(pData) +
    4322           0 :                                           iBandBufOffset,
    4323           0 :                                       eBufType, 0, pabySrcBlock + iSrcOffset,
    4324             :                                       eDataType, 0, 1);
    4325             :                 }
    4326             :             }
    4327             : 
    4328         270 :             iBufOffset += static_cast<int>(nPixelSpace);
    4329             :         }
    4330             :     }
    4331             : 
    4332             :     /* -------------------------------------------------------------------- */
    4333             :     /*      CleanupAndReturn.                                               */
    4334             :     /* -------------------------------------------------------------------- */
    4335           4 : CleanupAndReturn:
    4336           4 :     CPLFree(papabySrcBlock);
    4337           4 :     if (papoBlocks != nullptr)
    4338             :     {
    4339          12 :         for (int iBand = 0; iBand < nBandCount; iBand++)
    4340             :         {
    4341           8 :             if (papoBlocks[iBand] != nullptr)
    4342           8 :                 papoBlocks[iBand]->DropLock();
    4343             :         }
    4344           4 :         CPLFree(papoBlocks);
    4345             :     }
    4346             : 
    4347           4 :     return eErr;
    4348             : }
    4349             : 
    4350             : //! @endcond
    4351             : 
    4352             : /************************************************************************/
    4353             : /*                  GDALCopyWholeRasterGetSwathSize()                   */
    4354             : /************************************************************************/
    4355             : 
    4356        2825 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
    4357             :                                             GDALRasterBand *poDstPrototypeBand,
    4358             :                                             int nBandCount,
    4359             :                                             int bDstIsCompressed,
    4360             :                                             int bInterleave, int *pnSwathCols,
    4361             :                                             int *pnSwathLines)
    4362             : {
    4363        2825 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    4364        2825 :     int nSrcBlockXSize = 0;
    4365        2825 :     int nSrcBlockYSize = 0;
    4366        2825 :     int nBlockXSize = 0;
    4367        2825 :     int nBlockYSize = 0;
    4368             : 
    4369        2825 :     int nXSize = poSrcPrototypeBand->GetXSize();
    4370        2825 :     int nYSize = poSrcPrototypeBand->GetYSize();
    4371             : 
    4372        2825 :     poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
    4373        2825 :     poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4374             : 
    4375        2825 :     const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
    4376        2825 :     const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
    4377             : 
    4378        2825 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    4379        2825 :     if (bInterleave)
    4380        1314 :         nPixelSize *= nBandCount;
    4381             : 
    4382             :     // aim for one row of blocks.  Do not settle for less.
    4383        2825 :     int nSwathCols = nXSize;
    4384        2825 :     int nSwathLines = nMaxBlockYSize;
    4385             : 
    4386             :     const char *pszSrcCompression =
    4387        2825 :         poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    4388        2825 :     if (pszSrcCompression == nullptr)
    4389             :     {
    4390        2799 :         auto poSrcDS = poSrcPrototypeBand->GetDataset();
    4391        2799 :         if (poSrcDS)
    4392             :             pszSrcCompression =
    4393        2793 :                 poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    4394             :     }
    4395             : 
    4396             :     /* -------------------------------------------------------------------- */
    4397             :     /*      What will our swath size be?                                    */
    4398             :     /* -------------------------------------------------------------------- */
    4399             :     // When writing interleaved data in a compressed format, we want to be sure
    4400             :     // that each block will only be written once, so the swath size must not be
    4401             :     // greater than the block cache.
    4402        2825 :     const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
    4403             :     int nTargetSwathSize;
    4404        2825 :     if (pszSwathSize != nullptr)
    4405           0 :         nTargetSwathSize = static_cast<int>(
    4406           0 :             std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
    4407             :     else
    4408             :     {
    4409             :         // As a default, take one 1/4 of the cache size.
    4410        2825 :         nTargetSwathSize = static_cast<int>(
    4411        2825 :             std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
    4412             : 
    4413             :         // but if the minimum idal swath buf size is less, then go for it to
    4414             :         // avoid unnecessarily abusing RAM usage.
    4415             :         // but try to use 10 MB at least.
    4416        2825 :         GIntBig nIdealSwathBufSize =
    4417        2825 :             static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
    4418        2825 :         int nMinTargetSwathSize = 10 * 1000 * 1000;
    4419             : 
    4420        2825 :         if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
    4421        2825 :              GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
    4422             :         {
    4423           2 :             nMinTargetSwathSize = nTargetSwathSize;
    4424             :         }
    4425             : 
    4426        2825 :         if (nIdealSwathBufSize < nTargetSwathSize &&
    4427        2815 :             nIdealSwathBufSize < nMinTargetSwathSize)
    4428             :         {
    4429        2812 :             nIdealSwathBufSize = nMinTargetSwathSize;
    4430             :         }
    4431             : 
    4432        2825 :         if (pszSrcCompression != nullptr &&
    4433         156 :             EQUAL(pszSrcCompression, "JPEG2000") &&
    4434           0 :             (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
    4435           0 :                                    (nSrcBlockYSize % nBlockYSize) == 0)))
    4436             :         {
    4437           2 :             nIdealSwathBufSize =
    4438           4 :                 std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
    4439           2 :                                                  nSrcBlockYSize * nPixelSize);
    4440             :         }
    4441        2825 :         if (nTargetSwathSize > nIdealSwathBufSize)
    4442        2811 :             nTargetSwathSize = static_cast<int>(
    4443        2811 :                 std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
    4444             :     }
    4445             : 
    4446        2825 :     if (nTargetSwathSize < 1000000)
    4447           8 :         nTargetSwathSize = 1000000;
    4448             : 
    4449             :     /* But let's check that  */
    4450        3033 :     if (bDstIsCompressed && bInterleave &&
    4451         208 :         nTargetSwathSize > GDALGetCacheMax64())
    4452             :     {
    4453           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    4454             :                  "When translating into a compressed interleave format, "
    4455             :                  "the block cache size (" CPL_FRMT_GIB ") "
    4456             :                  "should be at least the size of the swath (%d) "
    4457             :                  "(GDAL_SWATH_SIZE config. option)",
    4458             :                  GDALGetCacheMax64(), nTargetSwathSize);
    4459             :     }
    4460             : 
    4461             : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
    4462             : #define ROUND_TO(x, y) (((x) / (y)) * (y))
    4463             : 
    4464             :     // if both input and output datasets are tiled, that the tile dimensions
    4465             :     // are "compatible", try to stick  to a swath dimension that is a multiple
    4466             :     // of input and output block dimensions.
    4467        2825 :     if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
    4468          34 :         IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
    4469          34 :         IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
    4470          34 :         IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    4471          34 :         IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    4472             :     {
    4473          34 :         if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
    4474          34 :                 nPixelSize <=
    4475          34 :             static_cast<GIntBig>(nTargetSwathSize))
    4476             :         {
    4477          34 :             nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
    4478          34 :             nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
    4479          34 :             if (nSwathCols == 0)
    4480           0 :                 nSwathCols = nMaxBlockXSize;
    4481          34 :             if (nSwathCols > nXSize)
    4482          32 :                 nSwathCols = nXSize;
    4483          34 :             nSwathLines = nMaxBlockYSize;
    4484             : 
    4485          34 :             if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
    4486          34 :                 static_cast<GIntBig>(nTargetSwathSize))
    4487             :             {
    4488           0 :                 nSwathCols = nXSize;
    4489           0 :                 nSwathLines = nBlockYSize;
    4490             :             }
    4491             :         }
    4492             :     }
    4493             : 
    4494        2825 :     const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
    4495        2825 :     const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
    4496        2825 :     if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
    4497             :     {
    4498           1 :         nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
    4499           1 :         if (nSwathLines == 0)
    4500           1 :             nSwathLines = 1;
    4501             : 
    4502           1 :         CPLDebug(
    4503             :             "GDAL",
    4504             :             "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
    4505             :             "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
    4506             :             "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
    4507           1 :             nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
    4508             :     }
    4509             :     // If we are processing single scans, try to handle several at once.
    4510             :     // If we are handling swaths already, only grow the swath if a row
    4511             :     // of blocks is substantially less than our target buffer size.
    4512        2824 :     else if (nSwathLines == 1 ||
    4513        2324 :              nMemoryPerCol * nSwathLines <
    4514        2324 :                  static_cast<GIntBig>(nTargetSwathSize) / 10)
    4515             :     {
    4516        2797 :         nSwathLines = std::min(
    4517             :             nYSize,
    4518        2797 :             std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
    4519             : 
    4520             :         /* If possible try to align to source and target block height */
    4521        2797 :         if ((nSwathLines % nMaxBlockYSize) != 0 &&
    4522         958 :             nSwathLines > nMaxBlockYSize &&
    4523         958 :             IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    4524         930 :             IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    4525         148 :             nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
    4526             :     }
    4527             : 
    4528        2825 :     if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
    4529           0 :         (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
    4530           0 :                                IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
    4531             :     {
    4532             :         // Typical use case: converting from Pleaiades that is 2048x2048 tiled.
    4533           2 :         if (nSwathLines < nSrcBlockYSize)
    4534             :         {
    4535           0 :             nSwathLines = nSrcBlockYSize;
    4536             : 
    4537             :             // Number of pixels that can be read/write simultaneously.
    4538           0 :             nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
    4539           0 :             nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
    4540           0 :             if (nSwathCols == 0)
    4541           0 :                 nSwathCols = nSrcBlockXSize;
    4542           0 :             if (nSwathCols > nXSize)
    4543           0 :                 nSwathCols = nXSize;
    4544             : 
    4545           0 :             CPLDebug(
    4546             :                 "GDAL",
    4547             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    4548             :                 "too high block, "
    4549             :                 "use partial width at one time");
    4550             :         }
    4551           2 :         else if ((nSwathLines % nSrcBlockYSize) != 0)
    4552             :         {
    4553             :             /* Round on a multiple of nSrcBlockYSize */
    4554           0 :             nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
    4555           0 :             CPLDebug(
    4556             :                 "GDAL",
    4557             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    4558             :                 "round nSwathLines to block height : %d",
    4559             :                 nSwathLines);
    4560             :         }
    4561             :     }
    4562        2823 :     else if (bDstIsCompressed)
    4563             :     {
    4564         365 :         if (nSwathLines < nBlockYSize)
    4565             :         {
    4566         142 :             nSwathLines = nBlockYSize;
    4567             : 
    4568             :             // Number of pixels that can be read/write simultaneously.
    4569         142 :             nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
    4570         142 :             nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
    4571         142 :             if (nSwathCols == 0)
    4572           0 :                 nSwathCols = nBlockXSize;
    4573         142 :             if (nSwathCols > nXSize)
    4574         142 :                 nSwathCols = nXSize;
    4575             : 
    4576         142 :             CPLDebug(
    4577             :                 "GDAL",
    4578             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    4579             :                 "too high block, "
    4580             :                 "use partial width at one time");
    4581             :         }
    4582         223 :         else if ((nSwathLines % nBlockYSize) != 0)
    4583             :         {
    4584             :             // Round on a multiple of nBlockYSize.
    4585           9 :             nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
    4586           9 :             CPLDebug(
    4587             :                 "GDAL",
    4588             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    4589             :                 "round nSwathLines to block height : %d",
    4590             :                 nSwathLines);
    4591             :         }
    4592             :     }
    4593             : 
    4594        2825 :     *pnSwathCols = nSwathCols;
    4595        2825 :     *pnSwathLines = nSwathLines;
    4596        2825 : }
    4597             : 
    4598             : /************************************************************************/
    4599             : /*                     GDALDatasetCopyWholeRaster()                     */
    4600             : /************************************************************************/
    4601             : 
    4602             : /**
    4603             :  * \brief Copy all dataset raster data.
    4604             :  *
    4605             :  * This function copies the complete raster contents of one dataset to
    4606             :  * another similarly configured dataset.  The source and destination
    4607             :  * dataset must have the same number of bands, and the same width
    4608             :  * and height.  The bands do not have to have the same data type.
    4609             :  *
    4610             :  * This function is primarily intended to support implementation of
    4611             :  * driver specific CreateCopy() functions.  It implements efficient copying,
    4612             :  * in particular "chunking" the copy in substantial blocks and, if appropriate,
    4613             :  * performing the transfer in a pixel interleaved fashion.
    4614             :  *
    4615             :  * Currently the only papszOptions value supported are :
    4616             :  * <ul>
    4617             :  * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
    4618             :  * write access pattern (this does not modify the layout of the destination
    4619             :  * data)</li> <li>"COMPRESSED=YES" to force alignment on target dataset block
    4620             :  * sizes to achieve best compression.</li> <li>"SKIP_HOLES=YES" to skip chunks
    4621             :  * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
    4622             :  * (GDAL &gt;= 2.2)</li>
    4623             :  * </ul>
    4624             :  * More options may be supported in the future.
    4625             :  *
    4626             :  * @param hSrcDS the source dataset
    4627             :  * @param hDstDS the destination dataset
    4628             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    4629             :  * @param pfnProgress progress reporting function.
    4630             :  * @param pProgressData callback data for progress function.
    4631             :  *
    4632             :  * @return CE_None on success, or CE_Failure on failure.
    4633             :  */
    4634             : 
    4635        2797 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
    4636             :                                               GDALDatasetH hDstDS,
    4637             :                                               CSLConstList papszOptions,
    4638             :                                               GDALProgressFunc pfnProgress,
    4639             :                                               void *pProgressData)
    4640             : 
    4641             : {
    4642        2797 :     VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    4643        2797 :     VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    4644             : 
    4645        2797 :     GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
    4646        2797 :     GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
    4647             : 
    4648        2797 :     if (pfnProgress == nullptr)
    4649           3 :         pfnProgress = GDALDummyProgress;
    4650             : 
    4651             :     /* -------------------------------------------------------------------- */
    4652             :     /*      Confirm the datasets match in size and band counts.             */
    4653             :     /* -------------------------------------------------------------------- */
    4654        2797 :     const int nXSize = poDstDS->GetRasterXSize();
    4655        2797 :     const int nYSize = poDstDS->GetRasterYSize();
    4656        2797 :     const int nBandCount = poDstDS->GetRasterCount();
    4657             : 
    4658        2797 :     if (poSrcDS->GetRasterXSize() != nXSize ||
    4659        5594 :         poSrcDS->GetRasterYSize() != nYSize ||
    4660        2797 :         poSrcDS->GetRasterCount() != nBandCount)
    4661             :     {
    4662           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    4663             :                  "Input and output dataset sizes or band counts do not\n"
    4664             :                  "match in GDALDatasetCopyWholeRaster()");
    4665           0 :         return CE_Failure;
    4666             :     }
    4667             : 
    4668             :     /* -------------------------------------------------------------------- */
    4669             :     /*      Report preliminary (0) progress.                                */
    4670             :     /* -------------------------------------------------------------------- */
    4671        2797 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    4672             :     {
    4673           1 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    4674             :                  "User terminated CreateCopy()");
    4675           1 :         return CE_Failure;
    4676             :     }
    4677             : 
    4678             :     /* -------------------------------------------------------------------- */
    4679             :     /*      Get our prototype band, and assume the others are similarly     */
    4680             :     /*      configured.                                                     */
    4681             :     /* -------------------------------------------------------------------- */
    4682        2796 :     if (nBandCount == 0)
    4683           0 :         return CE_None;
    4684             : 
    4685        2796 :     GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
    4686        2796 :     GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
    4687        2796 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    4688             : 
    4689             :     /* -------------------------------------------------------------------- */
    4690             :     /*      Do we want to try and do the operation in a pixel               */
    4691             :     /*      interleaved fashion?                                            */
    4692             :     /* -------------------------------------------------------------------- */
    4693        2796 :     bool bInterleave = false;
    4694             :     const char *pszInterleave =
    4695        2796 :         poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    4696        2796 :     if (pszInterleave != nullptr &&
    4697        1063 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    4698         151 :         bInterleave = true;
    4699             : 
    4700        2796 :     pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    4701        2796 :     if (pszInterleave != nullptr &&
    4702        2022 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    4703        1285 :         bInterleave = true;
    4704             : 
    4705        2796 :     pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
    4706        2796 :     if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
    4707           5 :         bInterleave = true;
    4708        2791 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
    4709           7 :         bInterleave = false;
    4710             :     // attributes is specific to the TileDB driver
    4711        2784 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
    4712           4 :         bInterleave = true;
    4713        2780 :     else if (pszInterleave != nullptr)
    4714             :     {
    4715           0 :         CPLError(CE_Warning, CPLE_NotSupported,
    4716             :                  "Unsupported value for option INTERLEAVE");
    4717             :     }
    4718             : 
    4719             :     // If the destination is compressed, we must try to write blocks just once,
    4720             :     // to save disk space (GTiff case for example), and to avoid data loss
    4721             :     // (JPEG compression for example).
    4722        2796 :     bool bDstIsCompressed = false;
    4723             :     const char *pszDstCompressed =
    4724        2796 :         CSLFetchNameValue(papszOptions, "COMPRESSED");
    4725        2796 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    4726         348 :         bDstIsCompressed = true;
    4727             : 
    4728             :     /* -------------------------------------------------------------------- */
    4729             :     /*      What will our swath size be?                                    */
    4730             :     /* -------------------------------------------------------------------- */
    4731             : 
    4732        2796 :     int nSwathCols = 0;
    4733        2796 :     int nSwathLines = 0;
    4734        2796 :     GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
    4735             :                                     nBandCount, bDstIsCompressed, bInterleave,
    4736             :                                     &nSwathCols, &nSwathLines);
    4737             : 
    4738        2796 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    4739        2796 :     if (bInterleave)
    4740        1314 :         nPixelSize *= nBandCount;
    4741             : 
    4742        2796 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    4743        2796 :     if (pSwathBuf == nullptr)
    4744             :     {
    4745           0 :         return CE_Failure;
    4746             :     }
    4747             : 
    4748        2796 :     CPLDebug("GDAL",
    4749             :              "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
    4750             :              nSwathCols, nSwathLines, static_cast<int>(bInterleave));
    4751             : 
    4752             :     // Advise the source raster that we are going to read it completely
    4753             :     // Note: this might already have been done by GDALCreateCopy() in the
    4754             :     // likely case this function is indirectly called by it
    4755        2796 :     poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
    4756        2796 :                         nullptr, nullptr);
    4757             : 
    4758             :     /* ==================================================================== */
    4759             :     /*      Band oriented (uninterleaved) case.                             */
    4760             :     /* ==================================================================== */
    4761        2796 :     CPLErr eErr = CE_None;
    4762             :     const bool bCheckHoles =
    4763        2796 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    4764             : 
    4765        2796 :     if (!bInterleave)
    4766             :     {
    4767             :         GDALRasterIOExtraArg sExtraArg;
    4768        1482 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    4769        1482 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    4770             : 
    4771        4446 :         const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
    4772        1482 :                                      DIV_ROUND_UP(nYSize, nSwathLines) *
    4773        1482 :                                      DIV_ROUND_UP(nXSize, nSwathCols);
    4774        1482 :         GIntBig nBlocksDone = 0;
    4775             : 
    4776        3668 :         for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
    4777             :         {
    4778        2186 :             int nBand = iBand + 1;
    4779             : 
    4780        4527 :             for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    4781             :             {
    4782        2341 :                 int nThisLines = nSwathLines;
    4783             : 
    4784        2341 :                 if (iY + nThisLines > nYSize)
    4785         271 :                     nThisLines = nYSize - iY;
    4786             : 
    4787        4682 :                 for (int iX = 0; iX < nXSize && eErr == CE_None;
    4788        2341 :                      iX += nSwathCols)
    4789             :                 {
    4790        2341 :                     int nThisCols = nSwathCols;
    4791             : 
    4792        2341 :                     if (iX + nThisCols > nXSize)
    4793           0 :                         nThisCols = nXSize - iX;
    4794             : 
    4795        2341 :                     int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    4796        2341 :                     if (bCheckHoles)
    4797             :                     {
    4798             :                         nStatus = poSrcDS->GetRasterBand(nBand)
    4799         926 :                                       ->GetDataCoverageStatus(
    4800             :                                           iX, iY, nThisCols, nThisLines,
    4801             :                                           GDAL_DATA_COVERAGE_STATUS_DATA);
    4802             :                     }
    4803        2341 :                     if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    4804             :                     {
    4805        2337 :                         sExtraArg.pfnProgress = GDALScaledProgress;
    4806        4674 :                         sExtraArg.pProgressData = GDALCreateScaledProgress(
    4807        2337 :                             nBlocksDone / static_cast<double>(nTotalBlocks),
    4808        2337 :                             (nBlocksDone + 0.5) /
    4809        2337 :                                 static_cast<double>(nTotalBlocks),
    4810             :                             pfnProgress, pProgressData);
    4811        2337 :                         if (sExtraArg.pProgressData == nullptr)
    4812        1400 :                             sExtraArg.pfnProgress = nullptr;
    4813             : 
    4814        2337 :                         eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    4815             :                                                  nThisLines, pSwathBuf,
    4816             :                                                  nThisCols, nThisLines, eDT, 1,
    4817             :                                                  &nBand, 0, 0, 0, &sExtraArg);
    4818             : 
    4819        2337 :                         GDALDestroyScaledProgress(sExtraArg.pProgressData);
    4820             : 
    4821        2337 :                         if (eErr == CE_None)
    4822        2333 :                             eErr = poDstDS->RasterIO(
    4823             :                                 GF_Write, iX, iY, nThisCols, nThisLines,
    4824             :                                 pSwathBuf, nThisCols, nThisLines, eDT, 1,
    4825             :                                 &nBand, 0, 0, 0, nullptr);
    4826             :                     }
    4827             : 
    4828        2341 :                     nBlocksDone++;
    4829        4643 :                     if (eErr == CE_None &&
    4830        2302 :                         !pfnProgress(nBlocksDone /
    4831        2302 :                                          static_cast<double>(nTotalBlocks),
    4832             :                                      nullptr, pProgressData))
    4833             :                     {
    4834           3 :                         eErr = CE_Failure;
    4835           3 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    4836             :                                  "User terminated CreateCopy()");
    4837             :                     }
    4838             :                 }
    4839             :             }
    4840             :         }
    4841             :     }
    4842             : 
    4843             :     /* ==================================================================== */
    4844             :     /*      Pixel interleaved case.                                         */
    4845             :     /* ==================================================================== */
    4846             :     else /* if( bInterleave ) */
    4847             :     {
    4848             :         GDALRasterIOExtraArg sExtraArg;
    4849        1314 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    4850        1314 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    4851             : 
    4852        1314 :         const GIntBig nTotalBlocks =
    4853        1314 :             static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
    4854        1314 :             DIV_ROUND_UP(nXSize, nSwathCols);
    4855        1314 :         GIntBig nBlocksDone = 0;
    4856             : 
    4857        2836 :         for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    4858             :         {
    4859        1522 :             int nThisLines = nSwathLines;
    4860             : 
    4861        1522 :             if (iY + nThisLines > nYSize)
    4862         186 :                 nThisLines = nYSize - iY;
    4863             : 
    4864        3049 :             for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    4865             :             {
    4866        1527 :                 int nThisCols = nSwathCols;
    4867             : 
    4868        1527 :                 if (iX + nThisCols > nXSize)
    4869           3 :                     nThisCols = nXSize - iX;
    4870             : 
    4871        1527 :                 int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    4872        1527 :                 if (bCheckHoles)
    4873             :                 {
    4874        1336 :                     nStatus = 0;
    4875        1389 :                     for (int iBand = 0; iBand < nBandCount; iBand++)
    4876             :                     {
    4877        1370 :                         nStatus |= poSrcDS->GetRasterBand(iBand + 1)
    4878        1370 :                                        ->GetDataCoverageStatus(
    4879             :                                            iX, iY, nThisCols, nThisLines,
    4880             :                                            GDAL_DATA_COVERAGE_STATUS_DATA);
    4881        1370 :                         if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    4882        1317 :                             break;
    4883             :                     }
    4884             :                 }
    4885        1527 :                 if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    4886             :                 {
    4887        1508 :                     sExtraArg.pfnProgress = GDALScaledProgress;
    4888        3016 :                     sExtraArg.pProgressData = GDALCreateScaledProgress(
    4889        1508 :                         nBlocksDone / static_cast<double>(nTotalBlocks),
    4890        1508 :                         (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
    4891             :                         pfnProgress, pProgressData);
    4892        1508 :                     if (sExtraArg.pProgressData == nullptr)
    4893         288 :                         sExtraArg.pfnProgress = nullptr;
    4894             : 
    4895        1508 :                     eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    4896             :                                              nThisLines, pSwathBuf, nThisCols,
    4897             :                                              nThisLines, eDT, nBandCount,
    4898             :                                              nullptr, 0, 0, 0, &sExtraArg);
    4899             : 
    4900        1508 :                     GDALDestroyScaledProgress(sExtraArg.pProgressData);
    4901             : 
    4902        1508 :                     if (eErr == CE_None)
    4903        1507 :                         eErr = poDstDS->RasterIO(
    4904             :                             GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
    4905             :                             nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
    4906             :                             0, 0, nullptr);
    4907             :                 }
    4908             : 
    4909        1527 :                 nBlocksDone++;
    4910        3050 :                 if (eErr == CE_None &&
    4911        1523 :                     !pfnProgress(nBlocksDone /
    4912        1523 :                                      static_cast<double>(nTotalBlocks),
    4913             :                                  nullptr, pProgressData))
    4914             :                 {
    4915           1 :                     eErr = CE_Failure;
    4916           1 :                     CPLError(CE_Failure, CPLE_UserInterrupt,
    4917             :                              "User terminated CreateCopy()");
    4918             :                 }
    4919             :             }
    4920             :         }
    4921             :     }
    4922             : 
    4923             :     /* -------------------------------------------------------------------- */
    4924             :     /*      Cleanup                                                         */
    4925             :     /* -------------------------------------------------------------------- */
    4926        2796 :     CPLFree(pSwathBuf);
    4927             : 
    4928        2796 :     return eErr;
    4929             : }
    4930             : 
    4931             : /************************************************************************/
    4932             : /*                     GDALRasterBandCopyWholeRaster()                  */
    4933             : /************************************************************************/
    4934             : 
    4935             : /**
    4936             :  * \brief Copy a whole raster band
    4937             :  *
    4938             :  * This function copies the complete raster contents of one band to
    4939             :  * another similarly configured band.  The source and destination
    4940             :  * bands must have the same width and height.  The bands do not have
    4941             :  * to have the same data type.
    4942             :  *
    4943             :  * It implements efficient copying, in particular "chunking" the copy in
    4944             :  * substantial blocks.
    4945             :  *
    4946             :  * Currently the only papszOptions value supported are :
    4947             :  * <ul>
    4948             :  * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
    4949             :  * achieve best compression.</li>
    4950             :  * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
    4951             :  * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL &gt;= 2.2)</li>
    4952             :  * </ul>
    4953             :  *
    4954             :  * @param hSrcBand the source band
    4955             :  * @param hDstBand the destination band
    4956             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    4957             :  * @param pfnProgress progress reporting function.
    4958             :  * @param pProgressData callback data for progress function.
    4959             :  *
    4960             :  * @return CE_None on success, or CE_Failure on failure.
    4961             :  */
    4962             : 
    4963          29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
    4964             :     GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
    4965             :     const char *const *const papszOptions, GDALProgressFunc pfnProgress,
    4966             :     void *pProgressData)
    4967             : 
    4968             : {
    4969          29 :     VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    4970          29 :     VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    4971             : 
    4972          29 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    4973          29 :     GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
    4974          29 :     CPLErr eErr = CE_None;
    4975             : 
    4976          29 :     if (pfnProgress == nullptr)
    4977          11 :         pfnProgress = GDALDummyProgress;
    4978             : 
    4979             :     /* -------------------------------------------------------------------- */
    4980             :     /*      Confirm the datasets match in size and band counts.             */
    4981             :     /* -------------------------------------------------------------------- */
    4982          29 :     int nXSize = poSrcBand->GetXSize();
    4983          29 :     int nYSize = poSrcBand->GetYSize();
    4984             : 
    4985          29 :     if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
    4986             :     {
    4987           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    4988             :                  "Input and output band sizes do not\n"
    4989             :                  "match in GDALRasterBandCopyWholeRaster()");
    4990           0 :         return CE_Failure;
    4991             :     }
    4992             : 
    4993             :     /* -------------------------------------------------------------------- */
    4994             :     /*      Report preliminary (0) progress.                                */
    4995             :     /* -------------------------------------------------------------------- */
    4996          29 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    4997             :     {
    4998           0 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    4999             :                  "User terminated CreateCopy()");
    5000           0 :         return CE_Failure;
    5001             :     }
    5002             : 
    5003          29 :     GDALDataType eDT = poDstBand->GetRasterDataType();
    5004             : 
    5005             :     // If the destination is compressed, we must try to write blocks just once,
    5006             :     // to save disk space (GTiff case for example), and to avoid data loss
    5007             :     // (JPEG compression for example).
    5008          29 :     bool bDstIsCompressed = false;
    5009             :     const char *pszDstCompressed =
    5010          29 :         CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
    5011          29 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5012          17 :         bDstIsCompressed = true;
    5013             : 
    5014             :     /* -------------------------------------------------------------------- */
    5015             :     /*      What will our swath size be?                                    */
    5016             :     /* -------------------------------------------------------------------- */
    5017             : 
    5018          29 :     int nSwathCols = 0;
    5019          29 :     int nSwathLines = 0;
    5020          29 :     GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
    5021             :                                     FALSE, &nSwathCols, &nSwathLines);
    5022             : 
    5023          29 :     const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5024             : 
    5025          29 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    5026          29 :     if (pSwathBuf == nullptr)
    5027             :     {
    5028           0 :         return CE_Failure;
    5029             :     }
    5030             : 
    5031          29 :     CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
    5032             :              nSwathCols, nSwathLines);
    5033             : 
    5034             :     const bool bCheckHoles =
    5035          29 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    5036             : 
    5037             :     // Advise the source raster that we are going to read it completely
    5038          29 :     poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
    5039             : 
    5040             :     /* ==================================================================== */
    5041             :     /*      Band oriented (uninterleaved) case.                             */
    5042             :     /* ==================================================================== */
    5043             : 
    5044          67 :     for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5045             :     {
    5046          38 :         int nThisLines = nSwathLines;
    5047             : 
    5048          38 :         if (iY + nThisLines > nYSize)
    5049           3 :             nThisLines = nYSize - iY;
    5050             : 
    5051          76 :         for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    5052             :         {
    5053          38 :             int nThisCols = nSwathCols;
    5054             : 
    5055          38 :             if (iX + nThisCols > nXSize)
    5056           0 :                 nThisCols = nXSize - iX;
    5057             : 
    5058          38 :             int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5059          38 :             if (bCheckHoles)
    5060             :             {
    5061           0 :                 nStatus = poSrcBand->GetDataCoverageStatus(
    5062             :                     iX, iY, nThisCols, nThisLines,
    5063             :                     GDAL_DATA_COVERAGE_STATUS_DATA);
    5064             :             }
    5065          38 :             if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5066             :             {
    5067          38 :                 eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
    5068             :                                            nThisLines, pSwathBuf, nThisCols,
    5069             :                                            nThisLines, eDT, 0, 0, nullptr);
    5070             : 
    5071          38 :                 if (eErr == CE_None)
    5072          38 :                     eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
    5073             :                                                nThisLines, pSwathBuf, nThisCols,
    5074             :                                                nThisLines, eDT, 0, 0, nullptr);
    5075             :             }
    5076             : 
    5077          76 :             if (eErr == CE_None &&
    5078          38 :                 !pfnProgress((iY + nThisLines) / static_cast<float>(nYSize),
    5079             :                              nullptr, pProgressData))
    5080             :             {
    5081           0 :                 eErr = CE_Failure;
    5082           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt,
    5083             :                          "User terminated CreateCopy()");
    5084             :             }
    5085             :         }
    5086             :     }
    5087             : 
    5088             :     /* -------------------------------------------------------------------- */
    5089             :     /*      Cleanup                                                         */
    5090             :     /* -------------------------------------------------------------------- */
    5091          29 :     CPLFree(pSwathBuf);
    5092             : 
    5093          29 :     return eErr;
    5094             : }
    5095             : 
    5096             : /************************************************************************/
    5097             : /*                      GDALCopyRasterIOExtraArg ()                     */
    5098             : /************************************************************************/
    5099             : 
    5100      323291 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
    5101             :                               GDALRasterIOExtraArg *psSrcArg)
    5102             : {
    5103      323291 :     INIT_RASTERIO_EXTRA_ARG(*psDestArg);
    5104      323291 :     if (psSrcArg)
    5105             :     {
    5106      323291 :         psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
    5107      323291 :         psDestArg->pfnProgress = psSrcArg->pfnProgress;
    5108      323291 :         psDestArg->pProgressData = psSrcArg->pProgressData;
    5109      323291 :         psDestArg->bFloatingPointWindowValidity =
    5110      323291 :             psSrcArg->bFloatingPointWindowValidity;
    5111      323291 :         if (psSrcArg->bFloatingPointWindowValidity)
    5112             :         {
    5113        3108 :             psDestArg->dfXOff = psSrcArg->dfXOff;
    5114        3108 :             psDestArg->dfYOff = psSrcArg->dfYOff;
    5115        3108 :             psDestArg->dfXSize = psSrcArg->dfXSize;
    5116        3108 :             psDestArg->dfYSize = psSrcArg->dfYSize;
    5117             :         }
    5118             :     }
    5119      323291 : }
    5120             : 
    5121             : /************************************************************************/
    5122             : /*                         HasOnlyNoData()                              */
    5123             : /************************************************************************/
    5124             : 
    5125     4798052 : template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
    5126             : {
    5127     4798052 :     return value == noDataValue;
    5128             : }
    5129             : 
    5130      560287 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
    5131             : {
    5132      560287 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5133             : }
    5134             : 
    5135      501120 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
    5136             : {
    5137      501120 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5138             : }
    5139             : 
    5140             : template <class T>
    5141        2079 : static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
    5142             :                            size_t nHeight, size_t nLineStride,
    5143             :                            size_t nComponents)
    5144             : {
    5145             :     // Fast test: check the 4 corners and the middle pixel.
    5146        3419 :     for (size_t iBand = 0; iBand < nComponents; iBand++)
    5147             :     {
    5148        4895 :         if (!(IsEqualToNoData(pBuffer[iBand], noDataValue) &&
    5149        2362 :               IsEqualToNoData(pBuffer[(nWidth - 1) * nComponents + iBand],
    5150        2283 :                               noDataValue) &&
    5151        2283 :               IsEqualToNoData(
    5152        2283 :                   pBuffer[((nHeight - 1) / 2 * nLineStride + (nWidth - 1) / 2) *
    5153        2283 :                               nComponents +
    5154             :                           iBand],
    5155        1351 :                   noDataValue) &&
    5156        1351 :               IsEqualToNoData(
    5157        1351 :                   pBuffer[(nHeight - 1) * nLineStride * nComponents + iBand],
    5158             :                   noDataValue) &&
    5159        1343 :               IsEqualToNoData(
    5160        1343 :                   pBuffer[((nHeight - 1) * nLineStride + nWidth - 1) *
    5161        1343 :                               nComponents +
    5162             :                           iBand],
    5163             :                   noDataValue)))
    5164             :         {
    5165        1193 :             return false;
    5166             :         }
    5167             :     }
    5168             : 
    5169             :     // Test all pixels.
    5170       17776 :     for (size_t iY = 0; iY < nHeight; iY++)
    5171             :     {
    5172       16914 :         const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
    5173     5866477 :         for (size_t iX = 0; iX < nWidth * nComponents; iX++)
    5174             :         {
    5175     5849583 :             if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
    5176             :             {
    5177          24 :                 return false;
    5178             :             }
    5179             :         }
    5180             :     }
    5181         862 :     return true;
    5182             : }
    5183             : 
    5184             : /************************************************************************/
    5185             : /*                    GDALBufferHasOnlyNoData()                         */
    5186             : /************************************************************************/
    5187             : 
    5188       25359 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
    5189             :                              size_t nWidth, size_t nHeight, size_t nLineStride,
    5190             :                              size_t nComponents, int nBitsPerSample,
    5191             :                              GDALBufferSampleFormat nSampleFormat)
    5192             : {
    5193             :     // In the case where the nodata is 0, we can compare several bytes at
    5194             :     // once. Select the largest natural integer type for the architecture.
    5195             : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
    5196             :     // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
    5197             :     typedef std::uint64_t WordType;
    5198             : #else
    5199             :     typedef std::uint32_t WordType;
    5200             : #endif
    5201       25359 :     if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    5202             :         // Do not use this optimized code path for floating point numbers,
    5203             :         // as it can't detect negative zero.
    5204             :         nSampleFormat != GSF_FLOATING_POINT)
    5205             :     {
    5206       23274 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    5207       23274 :         const size_t nSize =
    5208       23274 :             (nWidth * nHeight * nComponents * nBitsPerSample + 7) / 8;
    5209       23274 :         size_t i = 0;
    5210             :         const size_t nInitialIters =
    5211       46548 :             std::min(sizeof(WordType) -
    5212       23274 :                          static_cast<size_t>(
    5213             :                              reinterpret_cast<std::uintptr_t>(pabyBuffer) %
    5214             :                              sizeof(WordType)),
    5215       23274 :                      nSize);
    5216      194907 :         for (; i < nInitialIters; i++)
    5217             :         {
    5218      174929 :             if (pabyBuffer[i])
    5219        3296 :                 return false;
    5220             :         }
    5221    16064100 :         for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
    5222             :         {
    5223    16049600 :             if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
    5224        5446 :                 return false;
    5225             :         }
    5226       51877 :         for (; i < nSize; i++)
    5227             :         {
    5228       37350 :             if (pabyBuffer[i])
    5229           5 :                 return false;
    5230             :         }
    5231       14527 :         return true;
    5232             :     }
    5233             : 
    5234        2085 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
    5235             :     {
    5236        2228 :         return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
    5237        1114 :                HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
    5238        1114 :                               static_cast<uint8_t>(dfNoDataValue), nWidth,
    5239        1114 :                               nHeight, nLineStride, nComponents);
    5240             :     }
    5241         971 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
    5242             :     {
    5243             :         // Use unsigned implementation by converting the nodatavalue to
    5244             :         // unsigned
    5245          63 :         return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
    5246          31 :                HasOnlyNoDataT(
    5247             :                    static_cast<const uint8_t *>(pBuffer),
    5248          31 :                    static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
    5249          32 :                    nWidth, nHeight, nLineStride, nComponents);
    5250             :     }
    5251         939 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
    5252             :     {
    5253          21 :         return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
    5254          10 :                HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
    5255          10 :                               static_cast<uint16_t>(dfNoDataValue), nWidth,
    5256          11 :                               nHeight, nLineStride, nComponents);
    5257             :     }
    5258         928 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
    5259             :     {
    5260             :         // Use unsigned implementation by converting the nodatavalue to
    5261             :         // unsigned
    5262         109 :         return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
    5263          54 :                HasOnlyNoDataT(
    5264             :                    static_cast<const uint16_t *>(pBuffer),
    5265          54 :                    static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
    5266          55 :                    nWidth, nHeight, nLineStride, nComponents);
    5267             :     }
    5268         873 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
    5269             :     {
    5270          73 :         return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
    5271          36 :                HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
    5272             :                               static_cast<uint32_t>(dfNoDataValue), nWidth,
    5273          37 :                               nHeight, nLineStride, nComponents);
    5274             :     }
    5275         836 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
    5276             :     {
    5277             :         // Use unsigned implementation by converting the nodatavalue to
    5278             :         // unsigned
    5279          19 :         return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
    5280           9 :                HasOnlyNoDataT(
    5281             :                    static_cast<const uint32_t *>(pBuffer),
    5282           9 :                    static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
    5283          10 :                    nWidth, nHeight, nLineStride, nComponents);
    5284             :     }
    5285         826 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
    5286             :     {
    5287          56 :         return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
    5288          28 :                HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
    5289             :                               static_cast<uint64_t>(dfNoDataValue), nWidth,
    5290          28 :                               nHeight, nLineStride, nComponents);
    5291             :     }
    5292         798 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
    5293             :     {
    5294             :         // Use unsigned implementation by converting the nodatavalue to
    5295             :         // unsigned
    5296           0 :         return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
    5297           0 :                HasOnlyNoDataT(
    5298             :                    static_cast<const uint64_t *>(pBuffer),
    5299           0 :                    static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
    5300           0 :                    nWidth, nHeight, nLineStride, nComponents);
    5301             :     }
    5302         798 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    5303             :     {
    5304        1325 :         return (std::isnan(dfNoDataValue) ||
    5305        1354 :                 GDALIsValueInRange<float>(dfNoDataValue)) &&
    5306         676 :                HasOnlyNoDataT(static_cast<const float *>(pBuffer),
    5307             :                               static_cast<float>(dfNoDataValue), nWidth,
    5308         677 :                               nHeight, nLineStride, nComponents);
    5309             :     }
    5310         121 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    5311             :     {
    5312         121 :         return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
    5313             :                               dfNoDataValue, nWidth, nHeight, nLineStride,
    5314         121 :                               nComponents);
    5315             :     }
    5316           0 :     return false;
    5317             : }
    5318             : 
    5319             : #ifdef HAVE_SSE2
    5320             : 
    5321             : /************************************************************************/
    5322             : /*                    GDALDeinterleave3Byte()                           */
    5323             : /************************************************************************/
    5324             : 
    5325             : #if defined(__GNUC__) && !defined(__clang__)
    5326             : __attribute__((optimize("no-tree-vectorize")))
    5327             : #endif
    5328             : static void
    5329       69759 : GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    5330             :                       GByte *CPL_RESTRICT pabyDest0,
    5331             :                       GByte *CPL_RESTRICT pabyDest1,
    5332             :                       GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    5333             : #ifdef USE_NEON_OPTIMIZATIONS
    5334             : {
    5335             :     return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5336             :                                        nIters);
    5337             : }
    5338             : #else
    5339             : {
    5340             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    5341       69759 :     if (CPLHaveRuntimeSSSE3())
    5342             :     {
    5343       69766 :         return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    5344       69767 :                                            pabyDest2, nIters);
    5345             :     }
    5346             : #endif
    5347             : 
    5348           0 :     size_t i = 0;
    5349           0 :     if (((reinterpret_cast<uintptr_t>(pabySrc) |
    5350           0 :           reinterpret_cast<uintptr_t>(pabyDest0) |
    5351           0 :           reinterpret_cast<uintptr_t>(pabyDest1) |
    5352           0 :           reinterpret_cast<uintptr_t>(pabyDest2)) %
    5353             :          sizeof(unsigned int)) == 0)
    5354             :     {
    5355             :         // Slightly better than GCC autovectorizer
    5356          17 :         for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
    5357             :         {
    5358          15 :             unsigned int word0 =
    5359          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
    5360          15 :             unsigned int word1 =
    5361          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
    5362          15 :             unsigned int word2 =
    5363          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
    5364          15 :             reinterpret_cast<unsigned int *>(pabyDest0)[j] =
    5365          15 :                 (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
    5366          15 :                 ((word2 >> 8) << 24);
    5367          15 :             reinterpret_cast<unsigned int *>(pabyDest1)[j] =
    5368          15 :                 ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
    5369          15 :                 (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
    5370          15 :             pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
    5371          15 :             pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
    5372          15 :             pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
    5373          15 :             pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
    5374             :         }
    5375             :     }
    5376             : #if defined(__clang__)
    5377             : #pragma clang loop vectorize(disable)
    5378             : #endif
    5379           0 :     for (; i < nIters; ++i)
    5380             :     {
    5381           1 :         pabyDest0[i] = pabySrc[3 * i + 0];
    5382           1 :         pabyDest1[i] = pabySrc[3 * i + 1];
    5383           1 :         pabyDest2[i] = pabySrc[3 * i + 2];
    5384             :     }
    5385             : }
    5386             : #endif
    5387             : 
    5388             : /************************************************************************/
    5389             : /*                    GDALDeinterleave4Byte()                           */
    5390             : /************************************************************************/
    5391             : 
    5392             : #if !defined(__GNUC__) || defined(__clang__)
    5393             : 
    5394             : /************************************************************************/
    5395             : /*                         deinterleave()                               */
    5396             : /************************************************************************/
    5397             : 
    5398             : template <bool SHIFT, bool MASK>
    5399             : inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
    5400             :                             __m128i &xmm2_ori, __m128i &xmm3_ori)
    5401             : {
    5402             :     // Set higher 24bit of each int32 packed word to 0
    5403             :     if (SHIFT)
    5404             :     {
    5405             :         xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
    5406             :         xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
    5407             :         xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
    5408             :         xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
    5409             :     }
    5410             :     __m128i xmm0;
    5411             :     __m128i xmm1;
    5412             :     __m128i xmm2;
    5413             :     __m128i xmm3;
    5414             :     if (MASK)
    5415             :     {
    5416             :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    5417             :         xmm0 = _mm_and_si128(xmm0_ori, xmm_mask);
    5418             :         xmm1 = _mm_and_si128(xmm1_ori, xmm_mask);
    5419             :         xmm2 = _mm_and_si128(xmm2_ori, xmm_mask);
    5420             :         xmm3 = _mm_and_si128(xmm3_ori, xmm_mask);
    5421             :     }
    5422             :     else
    5423             :     {
    5424             :         xmm0 = xmm0_ori;
    5425             :         xmm1 = xmm1_ori;
    5426             :         xmm2 = xmm2_ori;
    5427             :         xmm3 = xmm3_ori;
    5428             :     }
    5429             :     // Pack int32 to int16
    5430             :     xmm0 = _mm_packs_epi32(xmm0, xmm1);
    5431             :     xmm2 = _mm_packs_epi32(xmm2, xmm3);
    5432             :     // Pack int16 to uint8
    5433             :     xmm0 = _mm_packus_epi16(xmm0, xmm2);
    5434             :     return xmm0;
    5435             : }
    5436             : 
    5437             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    5438             :                                   GByte *CPL_RESTRICT pabyDest0,
    5439             :                                   GByte *CPL_RESTRICT pabyDest1,
    5440             :                                   GByte *CPL_RESTRICT pabyDest2,
    5441             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    5442             : #ifdef USE_NEON_OPTIMIZATIONS
    5443             : {
    5444             :     return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5445             :                                        pabyDest3, nIters);
    5446             : }
    5447             : #else
    5448             : {
    5449             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    5450             :     if (CPLHaveRuntimeSSSE3())
    5451             :     {
    5452             :         return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    5453             :                                            pabyDest2, pabyDest3, nIters);
    5454             :     }
    5455             : #endif
    5456             : 
    5457             :     // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
    5458             :     // do something slightly better.
    5459             :     size_t i = 0;
    5460             :     for (; i + 15 < nIters; i += 16)
    5461             :     {
    5462             :         __m128i xmm0_ori = _mm_loadu_si128(
    5463             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
    5464             :         __m128i xmm1_ori = _mm_loadu_si128(
    5465             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
    5466             :         __m128i xmm2_ori = _mm_loadu_si128(
    5467             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
    5468             :         __m128i xmm3_ori = _mm_loadu_si128(
    5469             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
    5470             : 
    5471             :         _mm_storeu_si128(
    5472             :             reinterpret_cast<__m128i *>(pabyDest0 + i),
    5473             :             deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5474             :         _mm_storeu_si128(
    5475             :             reinterpret_cast<__m128i *>(pabyDest1 + i),
    5476             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5477             :         _mm_storeu_si128(
    5478             :             reinterpret_cast<__m128i *>(pabyDest2 + i),
    5479             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5480             :         _mm_storeu_si128(
    5481             :             reinterpret_cast<__m128i *>(pabyDest3 + i),
    5482             :             deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5483             :     }
    5484             : 
    5485             : #if defined(__clang__)
    5486             : #pragma clang loop vectorize(disable)
    5487             : #endif
    5488             :     for (; i < nIters; ++i)
    5489             :     {
    5490             :         pabyDest0[i] = pabySrc[4 * i + 0];
    5491             :         pabyDest1[i] = pabySrc[4 * i + 1];
    5492             :         pabyDest2[i] = pabySrc[4 * i + 2];
    5493             :         pabyDest3[i] = pabySrc[4 * i + 3];
    5494             :     }
    5495             : }
    5496             : #endif
    5497             : #else
    5498             : // GCC autovectorizer does an excellent job
    5499       52670 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
    5500             :     const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    5501             :     GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
    5502             :     GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    5503             : {
    5504   525866000 :     for (size_t i = 0; i < nIters; ++i)
    5505             :     {
    5506   525813000 :         pabyDest0[i] = pabySrc[4 * i + 0];
    5507   525813000 :         pabyDest1[i] = pabySrc[4 * i + 1];
    5508   525813000 :         pabyDest2[i] = pabySrc[4 * i + 2];
    5509   525813000 :         pabyDest3[i] = pabySrc[4 * i + 3];
    5510             :     }
    5511       52670 : }
    5512             : #endif
    5513             : 
    5514             : #else
    5515             : 
    5516             : /************************************************************************/
    5517             : /*                    GDALDeinterleave3Byte()                           */
    5518             : /************************************************************************/
    5519             : 
    5520             : // TODO: Enabling below could help on non-Intel architectures where GCC knows
    5521             : // how to auto-vectorize
    5522             : // #if defined(__GNUC__)
    5523             : //__attribute__((optimize("tree-vectorize")))
    5524             : // #endif
    5525             : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    5526             :                                   GByte *CPL_RESTRICT pabyDest0,
    5527             :                                   GByte *CPL_RESTRICT pabyDest1,
    5528             :                                   GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    5529             : {
    5530             :     for (size_t i = 0; i < nIters; ++i)
    5531             :     {
    5532             :         pabyDest0[i] = pabySrc[3 * i + 0];
    5533             :         pabyDest1[i] = pabySrc[3 * i + 1];
    5534             :         pabyDest2[i] = pabySrc[3 * i + 2];
    5535             :     }
    5536             : }
    5537             : 
    5538             : /************************************************************************/
    5539             : /*                    GDALDeinterleave4Byte()                           */
    5540             : /************************************************************************/
    5541             : 
    5542             : // TODO: Enabling below could help on non-Intel architectures where gcc knows
    5543             : // how to auto-vectorize
    5544             : // #if defined(__GNUC__)
    5545             : //__attribute__((optimize("tree-vectorize")))
    5546             : // #endif
    5547             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    5548             :                                   GByte *CPL_RESTRICT pabyDest0,
    5549             :                                   GByte *CPL_RESTRICT pabyDest1,
    5550             :                                   GByte *CPL_RESTRICT pabyDest2,
    5551             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    5552             : {
    5553             :     for (size_t i = 0; i < nIters; ++i)
    5554             :     {
    5555             :         pabyDest0[i] = pabySrc[4 * i + 0];
    5556             :         pabyDest1[i] = pabySrc[4 * i + 1];
    5557             :         pabyDest2[i] = pabySrc[4 * i + 2];
    5558             :         pabyDest3[i] = pabySrc[4 * i + 3];
    5559             :     }
    5560             : }
    5561             : 
    5562             : #endif
    5563             : 
    5564             : /************************************************************************/
    5565             : /*                      GDALDeinterleave()                              */
    5566             : /************************************************************************/
    5567             : 
    5568             : /*! Copy values from a pixel-interleave buffer to multiple per-component
    5569             :     buffers.
    5570             : 
    5571             :     In pseudo-code
    5572             :     \verbatim
    5573             :     for(size_t i = 0; i < nIters; ++i)
    5574             :         for(int iComp = 0; iComp < nComponents; iComp++ )
    5575             :             ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
    5576             :     \endverbatim
    5577             : 
    5578             :     The implementation is optimized for a few cases, like de-interleaving
    5579             :     of 3 or 4-components Byte buffers.
    5580             : 
    5581             :     \since GDAL 3.6
    5582             :  */
    5583      123191 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
    5584             :                       int nComponents, void **ppDestBuffer,
    5585             :                       GDALDataType eDestDT, size_t nIters)
    5586             : {
    5587      123191 :     if (eSourceDT == eDestDT)
    5588             :     {
    5589      123168 :         if (eSourceDT == GDT_Byte)
    5590             :         {
    5591      122426 :             if (nComponents == 3)
    5592             :             {
    5593       69756 :                 const GByte *CPL_RESTRICT pabySrc =
    5594             :                     static_cast<const GByte *>(pSourceBuffer);
    5595       69756 :                 GByte *CPL_RESTRICT pabyDest0 =
    5596             :                     static_cast<GByte *>(ppDestBuffer[0]);
    5597       69756 :                 GByte *CPL_RESTRICT pabyDest1 =
    5598             :                     static_cast<GByte *>(ppDestBuffer[1]);
    5599       69756 :                 GByte *CPL_RESTRICT pabyDest2 =
    5600             :                     static_cast<GByte *>(ppDestBuffer[2]);
    5601       69756 :                 GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5602             :                                       nIters);
    5603       69770 :                 return;
    5604             :             }
    5605       52670 :             else if (nComponents == 4)
    5606             :             {
    5607       52670 :                 const GByte *CPL_RESTRICT pabySrc =
    5608             :                     static_cast<const GByte *>(pSourceBuffer);
    5609       52670 :                 GByte *CPL_RESTRICT pabyDest0 =
    5610             :                     static_cast<GByte *>(ppDestBuffer[0]);
    5611       52670 :                 GByte *CPL_RESTRICT pabyDest1 =
    5612             :                     static_cast<GByte *>(ppDestBuffer[1]);
    5613       52670 :                 GByte *CPL_RESTRICT pabyDest2 =
    5614             :                     static_cast<GByte *>(ppDestBuffer[2]);
    5615       52670 :                 GByte *CPL_RESTRICT pabyDest3 =
    5616             :                     static_cast<GByte *>(ppDestBuffer[3]);
    5617       52670 :                 GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5618             :                                       pabyDest3, nIters);
    5619       52670 :                 return;
    5620             :             }
    5621             :         }
    5622             : #if ((defined(__GNUC__) && !defined(__clang__)) ||                             \
    5623             :      defined(__INTEL_CLANG_COMPILER)) &&                                       \
    5624             :     defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
    5625        1475 :         else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
    5626         742 :                  CPLHaveRuntimeSSSE3())
    5627             :         {
    5628         733 :             if (nComponents == 3)
    5629             :             {
    5630         239 :                 const GUInt16 *CPL_RESTRICT panSrc =
    5631             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    5632         239 :                 GUInt16 *CPL_RESTRICT panDest0 =
    5633             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    5634         239 :                 GUInt16 *CPL_RESTRICT panDest1 =
    5635             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    5636         239 :                 GUInt16 *CPL_RESTRICT panDest2 =
    5637             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    5638         239 :                 GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
    5639             :                                               panDest2, nIters);
    5640         239 :                 return;
    5641             :             }
    5642             : #if !defined(__INTEL_CLANG_COMPILER)
    5643             :             // ICC autovectorizer doesn't do a good job, at least with icx
    5644             :             // 2022.1.0.20220316
    5645         494 :             else if (nComponents == 4)
    5646             :             {
    5647         494 :                 const GUInt16 *CPL_RESTRICT panSrc =
    5648             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    5649         494 :                 GUInt16 *CPL_RESTRICT panDest0 =
    5650             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    5651         494 :                 GUInt16 *CPL_RESTRICT panDest1 =
    5652             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    5653         494 :                 GUInt16 *CPL_RESTRICT panDest2 =
    5654             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    5655         494 :                 GUInt16 *CPL_RESTRICT panDest3 =
    5656             :                     static_cast<GUInt16 *>(ppDestBuffer[3]);
    5657         494 :                 GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
    5658             :                                               panDest2, panDest3, nIters);
    5659         494 :                 return;
    5660             :             }
    5661             : #endif
    5662             :         }
    5663             : #endif
    5664             :     }
    5665             : 
    5666          23 :     const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
    5667          22 :     const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
    5668          87 :     for (int iComp = 0; iComp < nComponents; iComp++)
    5669             :     {
    5670          65 :         GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
    5671          65 :                             iComp * nSourceDTSize,
    5672             :                         eSourceDT, nComponents * nSourceDTSize,
    5673          65 :                         ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
    5674             :     }
    5675             : }

Generated by: LCOV version 1.14