LCOV - code coverage report
Current view: top level - gcore - rasterio.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2147 2371 90.6 %
Date: 2024-05-03 15:49:35 Functions: 338 371 91.1 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  Contains default implementation of GDALRasterBand::IRasterIO()
       5             :  *           and supporting functions of broader utility.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 1998, Frank Warmerdam
      10             :  * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * Permission is hereby granted, free of charge, to any person obtaining a
      13             :  * copy of this software and associated documentation files (the "Software"),
      14             :  * to deal in the Software without restriction, including without limitation
      15             :  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      16             :  * and/or sell copies of the Software, and to permit persons to whom the
      17             :  * Software is furnished to do so, subject to the following conditions:
      18             :  *
      19             :  * The above copyright notice and this permission notice shall be included
      20             :  * in all copies or substantial portions of the Software.
      21             :  *
      22             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      23             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      24             :  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
      25             :  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      26             :  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      27             :  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
      28             :  * DEALINGS IN THE SOFTWARE.
      29             :  ****************************************************************************/
      30             : 
      31             : #include "cpl_port.h"
      32             : #include "gdal.h"
      33             : #include "gdal_priv.h"
      34             : 
      35             : #include <cassert>
      36             : #include <climits>
      37             : #include <cmath>
      38             : #include <cstddef>
      39             : #include <cstdio>
      40             : #include <cstdlib>
      41             : #include <cstring>
      42             : 
      43             : #include <algorithm>
      44             : #include <limits>
      45             : #include <stdexcept>
      46             : #include <type_traits>
      47             : 
      48             : #include "cpl_conv.h"
      49             : #include "cpl_cpu_features.h"
      50             : #include "cpl_error.h"
      51             : #include "cpl_progress.h"
      52             : #include "cpl_string.h"
      53             : #include "cpl_vsi.h"
      54             : #include "gdal_priv_templates.hpp"
      55             : #include "gdal_vrt.h"
      56             : #include "gdalwarper.h"
      57             : #include "memdataset.h"
      58             : #include "vrtdataset.h"
      59             : 
      60             : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
      61             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
      62             :                              int nDstPixelStride, GPtrDiff_t nWordCount);
      63             : 
      64             : /************************************************************************/
      65             : /*                    DownsamplingIntegerXFactor()                      */
      66             : /************************************************************************/
      67             : 
      68             : template <bool bSameDataType, int DATA_TYPE_SIZE>
      69      413154 : static bool DownsamplingIntegerXFactor(
      70             :     GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
      71             :     GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
      72             :     GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
      73             :     int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
      74             : {
      75      413154 :     const int nBandDataSize =
      76             :         bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
      77      413154 :     int nOuterLoopIters = nBufXSize - 1;
      78      413154 :     const int nIncSrcOffset = nSrcXInc * nBandDataSize;
      79             :     const GByte *CPL_RESTRICT pabySrcData;
      80      413154 :     int nEndBlockX = nBlockXSize + nStartBlockX;
      81             : 
      82      413154 :     if (iSrcX < nEndBlockX)
      83             :     {
      84      226125 :         CPLAssert(poBlock);
      85      226125 :         goto no_reload_block;
      86             :     }
      87      187029 :     goto reload_block;
      88             : 
      89             :     // Don't do the last iteration in the loop, as iSrcX might go beyond
      90             :     // nRasterXSize - 1
      91      932720 :     while (--nOuterLoopIters >= 1)
      92             :     {
      93      189034 :         iSrcX += nSrcXInc;
      94      189034 :         pabySrcData += nIncSrcOffset;
      95      189034 :         pabyDstData += nPixelSpace;
      96             : 
      97             :         /* --------------------------------------------------------------------
      98             :          */
      99             :         /*      Ensure we have the appropriate block loaded. */
     100             :         /* --------------------------------------------------------------------
     101             :          */
     102      189034 :         if (iSrcX >= nEndBlockX)
     103             :         {
     104      189034 :         reload_block:
     105             :         {
     106      388653 :             const int nLBlockX = iSrcX / nBlockXSize;
     107      388653 :             nStartBlockX = nLBlockX * nBlockXSize;
     108      388653 :             nEndBlockX = nStartBlockX + nBlockXSize;
     109             : 
     110      388653 :             if (poBlock != nullptr)
     111      316698 :                 poBlock->DropLock();
     112             : 
     113      388653 :             poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     114      388653 :             if (poBlock == nullptr)
     115             :             {
     116           1 :                 return false;
     117             :             }
     118             :         }
     119             : 
     120      388652 :         no_reload_block:
     121             :             const GByte *pabySrcBlock =
     122      932720 :                 static_cast<const GByte *>(poBlock->GetDataRef());
     123      932720 :             GPtrDiff_t iSrcOffset =
     124      932720 :                 (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
     125      932720 :             pabySrcData = pabySrcBlock + iSrcOffset;
     126             :         }
     127             : 
     128             :         /* --------------------------------------------------------------------
     129             :          */
     130             :         /*      Copy the maximum run of pixels. */
     131             :         /* --------------------------------------------------------------------
     132             :          */
     133             : 
     134      932720 :         const int nIters = std::min(
     135      932720 :             (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
     136             :         if (bSameDataType)
     137             :         {
     138      932303 :             memcpy(pabyDstData, pabySrcData, nBandDataSize);
     139      932303 :             if (nIters > 1)
     140             :             {
     141             :                 if (DATA_TYPE_SIZE == 1)
     142             :                 {
     143      276277 :                     pabySrcData += nIncSrcOffset;
     144      276277 :                     pabyDstData += nPixelSpace;
     145      276277 :                     GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
     146      276277 :                                      nPixelSpace, nIters - 1);
     147      276277 :                     pabySrcData +=
     148      276277 :                         static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
     149      276277 :                     pabyDstData +=
     150      276277 :                         static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
     151             :                 }
     152             :                 else
     153             :                 {
     154     4443048 :                     for (int i = 0; i < nIters - 1; i++)
     155             :                     {
     156     4244524 :                         pabySrcData += nIncSrcOffset;
     157     4244524 :                         pabyDstData += nPixelSpace;
     158     4244524 :                         memcpy(pabyDstData, pabySrcData, nBandDataSize);
     159             :                     }
     160             :                 }
     161      474803 :                 iSrcX += nSrcXInc * (nIters - 1);
     162      474803 :                 nOuterLoopIters -= nIters - 1;
     163             :             }
     164             :         }
     165             :         else
     166             :         {
     167             :             // Type to type conversion ...
     168         417 :             GDALCopyWords(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
     169         417 :                           eBufType, nPixelSpace, std::max(1, nIters));
     170         417 :             if (nIters > 1)
     171             :             {
     172         206 :                 pabySrcData +=
     173         206 :                     static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
     174         206 :                 pabyDstData +=
     175         206 :                     static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
     176         206 :                 iSrcX += nSrcXInc * (nIters - 1);
     177         206 :                 nOuterLoopIters -= nIters - 1;
     178             :             }
     179             :         }
     180             :     }
     181             : 
     182             :     // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
     183      743686 :     if (nOuterLoopIters == 0)
     184             :     {
     185      330533 :         const int nRasterXSize = poBand->GetXSize();
     186      330533 :         iSrcX =
     187      661066 :             static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
     188      330533 :                                       static_cast<GInt64>(nRasterXSize - 1)));
     189      330533 :         pabyDstData += nPixelSpace;
     190      330533 :         if (iSrcX < nEndBlockX)
     191             :         {
     192      317943 :             goto no_reload_block;
     193             :         }
     194       12590 :         goto reload_block;
     195             :     }
     196      413153 :     return true;
     197             : }
     198             : 
     199             : /************************************************************************/
     200             : /*                             IRasterIO()                              */
     201             : /*                                                                      */
     202             : /*      Default internal implementation of RasterIO() ... utilizes      */
     203             : /*      the Block access methods to satisfy the request.  This would    */
     204             : /*      normally only be overridden by formats with overviews.          */
     205             : /************************************************************************/
     206             : 
     207     5370250 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
     208             :                                  int nXSize, int nYSize, void *pData,
     209             :                                  int nBufXSize, int nBufYSize,
     210             :                                  GDALDataType eBufType, GSpacing nPixelSpace,
     211             :                                  GSpacing nLineSpace,
     212             :                                  GDALRasterIOExtraArg *psExtraArg)
     213             : 
     214             : {
     215     5370250 :     if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
     216             :     {
     217           0 :         CPLError(eFlushBlockErr, CPLE_AppDefined,
     218             :                  "An error occurred while writing a dirty block "
     219             :                  "from GDALRasterBand::IRasterIO");
     220           0 :         CPLErr eErr = eFlushBlockErr;
     221           0 :         eFlushBlockErr = CE_None;
     222           0 :         return eErr;
     223             :     }
     224     5370250 :     if (nBlockXSize <= 0 || nBlockYSize <= 0)
     225             :     {
     226           0 :         CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
     227           0 :         return CE_Failure;
     228             :     }
     229             : 
     230     5370260 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
     231     5370260 :     const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
     232     5370260 :     GByte dummyBlock[2] = {0, 0};
     233     5370260 :     GByte *pabySrcBlock =
     234             :         dummyBlock; /* to avoid Coverity warning about nullptr dereference */
     235     5370260 :     GDALRasterBlock *poBlock = nullptr;
     236     5370260 :     const bool bUseIntegerRequestCoords =
     237     5410570 :         (!psExtraArg->bFloatingPointWindowValidity ||
     238       40304 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
     239       15798 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
     240             : 
     241             :     /* ==================================================================== */
     242             :     /*      A common case is the data requested with the destination        */
     243             :     /*      is packed, and the block width is the raster width.             */
     244             :     /* ==================================================================== */
     245     5290230 :     if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
     246     2644570 :         nBlockXSize == GetXSize() && nBufXSize == nXSize &&
     247    10660500 :         nBufYSize == nYSize && bUseIntegerRequestCoords)
     248             :     {
     249     2539540 :         CPLErr eErr = CE_None;
     250     2539540 :         int nLBlockY = -1;
     251             : 
     252     6908740 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     253             :         {
     254     4370250 :             const int iSrcY = iBufYOff + nYOff;
     255             : 
     256     4370250 :             if (iSrcY < nLBlockY * nBlockYSize ||
     257     4370250 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
     258             :             {
     259     2775810 :                 nLBlockY = iSrcY / nBlockYSize;
     260     2775810 :                 bool bJustInitialize =
     261       83668 :                     eRWFlag == GF_Write && nXOff == 0 &&
     262     2907720 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     263       48245 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
     264             : 
     265             :                 // Is this a partial tile at right and/or bottom edges of
     266             :                 // the raster, and that is going to be completely written?
     267             :                 // If so, do not load it from storage, but zero it so that
     268             :                 // the content outsize of the validity area is initialized.
     269     2775810 :                 bool bMemZeroBuffer = false;
     270       83668 :                 if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
     271       20819 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     272     2859570 :                     nYOff + nYSize == GetYSize() &&
     273          86 :                     nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
     274             :                 {
     275          86 :                     bJustInitialize = true;
     276          86 :                     bMemZeroBuffer = true;
     277             :                 }
     278             : 
     279     2775810 :                 if (poBlock)
     280      236279 :                     poBlock->DropLock();
     281             : 
     282     2775810 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     283     2775820 :                 poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
     284     2775840 :                 if (poBlock == nullptr)
     285             :                 {
     286        1064 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     287             :                         nullptr)
     288             :                     {
     289           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     290             :                                  "GetBlockRef failed at X block offset %d, "
     291             :                                  "Y block offset %d%s",
     292             :                                  0, nLBlockY,
     293           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     294           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     295             :                                      : "");
     296             :                     }
     297        1064 :                     eErr = CE_Failure;
     298        1064 :                     break;
     299             :                 }
     300             : 
     301     2774770 :                 if (eRWFlag == GF_Write)
     302       83668 :                     poBlock->MarkDirty();
     303             : 
     304     2774770 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     305     2774780 :                 if (bMemZeroBuffer)
     306             :                 {
     307          86 :                     memset(pabySrcBlock, 0,
     308          86 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     309          86 :                                nBlockXSize * nBlockYSize);
     310             :                 }
     311             :             }
     312             : 
     313     4369210 :             const auto nSrcByteOffset =
     314     4369210 :                 (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
     315     4369210 :                      nBlockXSize +
     316     4369210 :                  nXOff) *
     317     4369210 :                 nBandDataSize;
     318             : 
     319     4369210 :             if (eDataType == eBufType)
     320             :             {
     321     1594200 :                 if (eRWFlag == GF_Read)
     322     1375040 :                     memcpy(static_cast<GByte *>(pData) +
     323     1375040 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     324     1375040 :                            pabySrcBlock + nSrcByteOffset,
     325             :                            static_cast<size_t>(nLineSpace));
     326             :                 else
     327      219160 :                     memcpy(pabySrcBlock + nSrcByteOffset,
     328      219160 :                            static_cast<GByte *>(pData) +
     329      219160 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     330             :                            static_cast<size_t>(nLineSpace));
     331             :             }
     332             :             else
     333             :             {
     334             :                 // Type to type conversion.
     335             : 
     336     2775010 :                 if (eRWFlag == GF_Read)
     337     2757880 :                     GDALCopyWords(
     338     2757880 :                         pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
     339             :                         static_cast<GByte *>(pData) +
     340     2757880 :                             static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     341             :                         eBufType, static_cast<int>(nPixelSpace), nBufXSize);
     342             :                 else
     343       17135 :                     GDALCopyWords(static_cast<GByte *>(pData) +
     344       17135 :                                       static_cast<GPtrDiff_t>(iBufYOff) *
     345             :                                           nLineSpace,
     346             :                                   eBufType, static_cast<int>(nPixelSpace),
     347       17135 :                                   pabySrcBlock + nSrcByteOffset, eDataType,
     348             :                                   nBandDataSize, nBufXSize);
     349             :             }
     350             : 
     351     4398730 :             if (psExtraArg->pfnProgress != nullptr &&
     352       29523 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     353             :                                          psExtraArg->pProgressData))
     354             :             {
     355           5 :                 eErr = CE_Failure;
     356           5 :                 break;
     357             :             }
     358             :         }
     359             : 
     360     2539560 :         if (poBlock)
     361     2538500 :             poBlock->DropLock();
     362             : 
     363     2539560 :         return eErr;
     364             :     }
     365             : 
     366             :     /* ==================================================================== */
     367             :     /*      Do we have overviews that would be appropriate to satisfy       */
     368             :     /*      this request?                                                   */
     369             :     /* ==================================================================== */
     370     2830720 :     if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
     371             :         eRWFlag == GF_Read)
     372             :     {
     373             :         GDALRasterIOExtraArg sExtraArg;
     374        3330 :         GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
     375             : 
     376             :         const int nOverview =
     377        3330 :             GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
     378             :                                           nBufXSize, nBufYSize, &sExtraArg);
     379        3330 :         if (nOverview >= 0)
     380             :         {
     381        3304 :             GDALRasterBand *poOverviewBand = GetOverview(nOverview);
     382        3304 :             if (poOverviewBand == nullptr)
     383        3304 :                 return CE_Failure;
     384             : 
     385        3304 :             return poOverviewBand->RasterIO(
     386             :                 eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
     387        3304 :                 nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
     388             :         }
     389             :     }
     390             : 
     391      638801 :     if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
     392           0 :         nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
     393     3466210 :         nLineSpace == nPixelSpace * nBufXSize &&
     394           0 :         CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
     395             :     {
     396           0 :         memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
     397           0 :         return CE_None;
     398             :     }
     399             : 
     400             :     /* ==================================================================== */
     401             :     /*      The second case when we don't need subsample data but likely    */
     402             :     /*      need data type conversion.                                      */
     403             :     /* ==================================================================== */
     404     2827410 :     if (  // nPixelSpace == nBufDataSize &&
     405     2827410 :         nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
     406             :     {
     407             : #if DEBUG_VERBOSE
     408             :         printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
     409             :                nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
     410             : #endif
     411             : 
     412             :         /* --------------------------------------------------------------------
     413             :          */
     414             :         /*      Loop over buffer computing source locations. */
     415             :         /* --------------------------------------------------------------------
     416             :          */
     417             :         // Calculate starting values out of loop
     418     2463920 :         const int nLBlockXStart = nXOff / nBlockXSize;
     419     2463920 :         const int nXSpanEnd = nBufXSize + nXOff;
     420             : 
     421     2463920 :         int nYInc = 0;
     422     4955080 :         for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
     423     2491150 :              iBufYOff += nYInc, iSrcY += nYInc)
     424             :         {
     425     2491220 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     426             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     427     2491220 :             int nLBlockY = iSrcY / nBlockYSize;
     428     2491220 :             int nLBlockX = nLBlockXStart;
     429     2491220 :             int iSrcX = nXOff;
     430     5186000 :             while (iSrcX < nXSpanEnd)
     431             :             {
     432     2694840 :                 int nXSpan = nLBlockX * nBlockXSize;
     433     2694840 :                 if (nXSpan < INT_MAX - nBlockXSize)
     434     2694840 :                     nXSpan += nBlockXSize;
     435             :                 else
     436           1 :                     nXSpan = INT_MAX;
     437     2694840 :                 const int nXRight = nXSpan;
     438     2694840 :                 nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
     439     2694840 :                 const size_t nXSpanSize =
     440     2694840 :                     nXSpan * static_cast<size_t>(nPixelSpace);
     441             : 
     442     2694840 :                 bool bJustInitialize =
     443     2040120 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
     444       37043 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
     445     4760480 :                     nXOff <= nLBlockX * nBlockXSize &&
     446       25526 :                     nXOff + nXSize >= nXRight;
     447             : 
     448             :                 // Is this a partial tile at right and/or bottom edges of
     449             :                 // the raster, and that is going to be completely written?
     450             :                 // If so, do not load it from storage, but zero it so that
     451             :                 // the content outsize of the validity area is initialized.
     452     2694840 :                 bool bMemZeroBuffer = false;
     453     2040120 :                 if (eRWFlag == GF_Write && !bJustInitialize &&
     454     2015840 :                     nXOff <= nLBlockX * nBlockXSize &&
     455     2014220 :                     nYOff <= nLBlockY * nBlockYSize &&
     456       12014 :                     (nXOff + nXSize >= nXRight ||
     457     4737620 :                      (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
     458       11837 :                     (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
     459       10597 :                      (nYOff + nYSize == GetYSize() &&
     460        1839 :                       nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
     461             :                 {
     462        3079 :                     bJustInitialize = true;
     463        3079 :                     bMemZeroBuffer = true;
     464             :                 }
     465             : 
     466             :                 /* --------------------------------------------------------------------
     467             :                  */
     468             :                 /*      Ensure we have the appropriate block loaded. */
     469             :                 /* --------------------------------------------------------------------
     470             :                  */
     471     2694840 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     472     2694840 :                 poBlock =
     473     2694840 :                     GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     474     2694840 :                 if (!poBlock)
     475             :                 {
     476          70 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     477             :                         nullptr)
     478             :                     {
     479           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     480             :                                  "GetBlockRef failed at X block offset %d, "
     481             :                                  "Y block offset %d%s",
     482             :                                  nLBlockX, nLBlockY,
     483           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     484           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     485             :                                      : "");
     486             :                     }
     487          70 :                     return (CE_Failure);
     488             :                 }
     489             : 
     490     2694770 :                 if (eRWFlag == GF_Write)
     491     2040120 :                     poBlock->MarkDirty();
     492             : 
     493     2694770 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     494     2694770 :                 if (bMemZeroBuffer)
     495             :                 {
     496        3079 :                     memset(pabySrcBlock, 0,
     497        3079 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     498        3079 :                                nBlockXSize * nBlockYSize);
     499             :                 }
     500             :                 /* --------------------------------------------------------------------
     501             :                  */
     502             :                 /*      Copy over this chunk of data. */
     503             :                 /* --------------------------------------------------------------------
     504             :                  */
     505     2694770 :                 GPtrDiff_t iSrcOffset =
     506     2694770 :                     (static_cast<GPtrDiff_t>(iSrcX) -
     507     2694770 :                      static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
     508     2694770 :                      (static_cast<GPtrDiff_t>(iSrcY) -
     509     2694770 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     510     2694770 :                          nBlockXSize) *
     511     2694770 :                     nBandDataSize;
     512             :                 // Fill up as many rows as possible for the loaded block.
     513     5389540 :                 const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
     514     2694770 :                                           nBufYSize - iBufYOff);
     515    56709400 :                 for (int k = 0; k < kmax; k++)
     516             :                 {
     517    54014600 :                     if (eDataType == eBufType && nPixelSpace == nBufDataSize)
     518             :                     {
     519    50130800 :                         if (eRWFlag == GF_Read)
     520    45724300 :                             memcpy(static_cast<GByte *>(pData) + iBufOffset +
     521    45724300 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     522    45724300 :                                    pabySrcBlock + iSrcOffset, nXSpanSize);
     523             :                         else
     524     4406500 :                             memcpy(pabySrcBlock + iSrcOffset,
     525     4406500 :                                    static_cast<GByte *>(pData) + iBufOffset +
     526     4406500 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     527             :                                    nXSpanSize);
     528             :                     }
     529             :                     else
     530             :                     {
     531             :                         /* type to type conversion */
     532     3883750 :                         if (eRWFlag == GF_Read)
     533     3824760 :                             GDALCopyWords(
     534     3824760 :                                 pabySrcBlock + iSrcOffset, eDataType,
     535             :                                 nBandDataSize,
     536     3824760 :                                 static_cast<GByte *>(pData) + iBufOffset +
     537     3824760 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     538             :                                 eBufType, static_cast<int>(nPixelSpace),
     539             :                                 nXSpan);
     540             :                         else
     541       58992 :                             GDALCopyWords(
     542       58992 :                                 static_cast<GByte *>(pData) + iBufOffset +
     543       58992 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     544             :                                 eBufType, static_cast<int>(nPixelSpace),
     545       58992 :                                 pabySrcBlock + iSrcOffset, eDataType,
     546             :                                 nBandDataSize, nXSpan);
     547             :                     }
     548             : 
     549    54014600 :                     iSrcOffset +=
     550    54014600 :                         static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
     551             :                 }
     552             : 
     553             :                 iBufOffset =
     554     2694770 :                     CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
     555     2694770 :                 nLBlockX++;
     556     2694770 :                 iSrcX += nXSpan;
     557             : 
     558     2694770 :                 poBlock->DropLock();
     559     2694770 :                 poBlock = nullptr;
     560             :             }
     561             : 
     562             :             /* Compute the increment to go on a block boundary */
     563     2491160 :             nYInc = nBlockYSize - (iSrcY % nBlockYSize);
     564             : 
     565     2491220 :             if (psExtraArg->pfnProgress != nullptr &&
     566          66 :                 !psExtraArg->pfnProgress(
     567     2491220 :                     1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
     568             :                     psExtraArg->pProgressData))
     569             :             {
     570           0 :                 return CE_Failure;
     571             :             }
     572             :         }
     573             : 
     574     2463850 :         return CE_None;
     575             :     }
     576             : 
     577             :     /* ==================================================================== */
     578             :     /*      Loop reading required source blocks to satisfy output           */
     579             :     /*      request.  This is the most general implementation.              */
     580             :     /* ==================================================================== */
     581             : 
     582      363491 :     double dfXOff = nXOff;
     583      363491 :     double dfYOff = nYOff;
     584      363491 :     double dfXSize = nXSize;
     585      363491 :     double dfYSize = nYSize;
     586      363491 :     if (psExtraArg->bFloatingPointWindowValidity)
     587             :     {
     588       28888 :         dfXOff = psExtraArg->dfXOff;
     589       28888 :         dfYOff = psExtraArg->dfYOff;
     590       28888 :         dfXSize = psExtraArg->dfXSize;
     591       28888 :         dfYSize = psExtraArg->dfYSize;
     592             :     }
     593             : 
     594             :     /* -------------------------------------------------------------------- */
     595             :     /*      Compute stepping increment.                                     */
     596             :     /* -------------------------------------------------------------------- */
     597      363491 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
     598      363491 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
     599      363491 :     CPLErr eErr = CE_None;
     600             : 
     601      363491 :     if (eRWFlag == GF_Write)
     602             :     {
     603             :         /* --------------------------------------------------------------------
     604             :          */
     605             :         /*    Write case */
     606             :         /*    Loop over raster window computing source locations in the buffer.
     607             :          */
     608             :         /* --------------------------------------------------------------------
     609             :          */
     610      166650 :         GByte *pabyDstBlock = nullptr;
     611      166650 :         int nLBlockX = -1;
     612      166650 :         int nLBlockY = -1;
     613             : 
     614     1259590 :         for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
     615             :         {
     616     1092940 :             const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
     617             : 
     618    12063600 :             for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
     619             :             {
     620    10970600 :                 const int iBufXOff =
     621    10970600 :                     static_cast<int>((iDstX - nXOff) / dfSrcXInc);
     622    10970600 :                 GPtrDiff_t iBufOffset =
     623    10970600 :                     static_cast<GPtrDiff_t>(iBufYOff) *
     624             :                         static_cast<GPtrDiff_t>(nLineSpace) +
     625    10970600 :                     iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
     626             : 
     627             :                 // FIXME: this code likely doesn't work if the dirty block gets
     628             :                 // flushed to disk before being completely written.
     629             :                 // In the meantime, bJustInitialize should probably be set to
     630             :                 // FALSE even if it is not ideal performance wise, and for
     631             :                 // lossy compression.
     632             : 
     633             :                 /* --------------------------------------------------------------------
     634             :                  */
     635             :                 /*      Ensure we have the appropriate block loaded. */
     636             :                 /* --------------------------------------------------------------------
     637             :                  */
     638    10970600 :                 if (iDstX < nLBlockX * nBlockXSize ||
     639    10721300 :                     iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
     640    10264600 :                     iDstY < nLBlockY * nBlockYSize ||
     641    10264600 :                     iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
     642             :                 {
     643      738642 :                     nLBlockX = iDstX / nBlockXSize;
     644      738642 :                     nLBlockY = iDstY / nBlockYSize;
     645             : 
     646      738642 :                     const bool bJustInitialize =
     647     1065870 :                         nYOff <= nLBlockY * nBlockYSize &&
     648      327231 :                         nYOff + nYSize - nBlockYSize >=
     649      327231 :                             nLBlockY * nBlockYSize &&
     650     1116140 :                         nXOff <= nLBlockX * nBlockXSize &&
     651       50265 :                         nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
     652             :                     /*bool bMemZeroBuffer = FALSE;
     653             :                     if( !bJustInitialize &&
     654             :                         nXOff <= nLBlockX * nBlockXSize &&
     655             :                         nYOff <= nLBlockY * nBlockYSize &&
     656             :                         (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
     657             :                          (nXOff + nXSize == GetXSize() &&
     658             :                          (nLBlockX+1) * nBlockXSize > GetXSize())) &&
     659             :                         (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
     660             :                          (nYOff + nYSize == GetYSize() &&
     661             :                          (nLBlockY+1) * nBlockYSize > GetYSize())) )
     662             :                     {
     663             :                         bJustInitialize = TRUE;
     664             :                         bMemZeroBuffer = TRUE;
     665             :                     }*/
     666      738642 :                     if (poBlock != nullptr)
     667      571992 :                         poBlock->DropLock();
     668             : 
     669      738642 :                     poBlock =
     670      738642 :                         GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     671      738642 :                     if (poBlock == nullptr)
     672             :                     {
     673           0 :                         return (CE_Failure);
     674             :                     }
     675             : 
     676      738642 :                     poBlock->MarkDirty();
     677             : 
     678      738642 :                     pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
     679             :                     /*if( bMemZeroBuffer )
     680             :                     {
     681             :                         memset(pabyDstBlock, 0,
     682             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
     683             :                     * nBlockYSize);
     684             :                     }*/
     685             :                 }
     686             : 
     687             :                 // To make Coverity happy. Should not happen by design.
     688    10970600 :                 if (pabyDstBlock == nullptr)
     689             :                 {
     690           0 :                     CPLAssert(false);
     691             :                     eErr = CE_Failure;
     692             :                     break;
     693             :                 }
     694             : 
     695             :                 /* --------------------------------------------------------------------
     696             :                  */
     697             :                 /*      Copy over this pixel of data. */
     698             :                 /* --------------------------------------------------------------------
     699             :                  */
     700    10970600 :                 GPtrDiff_t iDstOffset =
     701    10970600 :                     (static_cast<GPtrDiff_t>(iDstX) -
     702    10970600 :                      static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
     703    10970600 :                      (static_cast<GPtrDiff_t>(iDstY) -
     704    10970600 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     705    10970600 :                          nBlockXSize) *
     706    10970600 :                     nBandDataSize;
     707             : 
     708    10970600 :                 if (eDataType == eBufType)
     709             :                 {
     710    10967500 :                     memcpy(pabyDstBlock + iDstOffset,
     711    10967500 :                            static_cast<GByte *>(pData) + iBufOffset,
     712             :                            nBandDataSize);
     713             :                 }
     714             :                 else
     715             :                 {
     716             :                     /* type to type conversion ... ouch, this is expensive way
     717             :                     of handling single words */
     718             : 
     719        3096 :                     GDALCopyWords(static_cast<GByte *>(pData) + iBufOffset,
     720        3096 :                                   eBufType, 0, pabyDstBlock + iDstOffset,
     721             :                                   eDataType, 0, 1);
     722             :                 }
     723             :             }
     724             : 
     725     1092940 :             if (psExtraArg->pfnProgress != nullptr &&
     726           0 :                 !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
     727             :                                          psExtraArg->pProgressData))
     728             :             {
     729           0 :                 eErr = CE_Failure;
     730           0 :                 break;
     731             :             }
     732             :         }
     733             :     }
     734             :     else
     735             :     {
     736      196841 :         if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
     737             :         {
     738        7484 :             if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
     739        2452 :                  psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
     740        2450 :                  psExtraArg->eResampleAlg == GRIORA_Bilinear ||
     741        5037 :                  psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
     742        2438 :                 GetColorTable() != nullptr)
     743             :             {
     744           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     745             :                          "Resampling method not supported on paletted band. "
     746             :                          "Falling back to nearest neighbour");
     747             :             }
     748        2519 :             else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
     749           3 :                      GDALDataTypeIsComplex(eDataType))
     750             :             {
     751           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     752             :                          "Resampling method not supported on complex data type "
     753             :                          "band. Falling back to nearest neighbour");
     754             :             }
     755             :             else
     756             :             {
     757        2516 :                 return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
     758             :                                          pData, nBufXSize, nBufYSize, eBufType,
     759        2516 :                                          nPixelSpace, nLineSpace, psExtraArg);
     760             :             }
     761             :         }
     762             : 
     763      194324 :         int nLimitBlockY = 0;
     764      194324 :         const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
     765      194324 :         int nStartBlockX = -nBlockXSize;
     766      194324 :         const double EPS = 1e-10;
     767      194324 :         int nLBlockY = -1;
     768      194324 :         const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
     769      194324 :         const bool bIntegerXFactor =
     770      170831 :             bUseIntegerRequestCoords &&
     771      266279 :             static_cast<int>(dfSrcXInc) == dfSrcXInc &&
     772       71955 :             static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
     773             : 
     774             :         /* --------------------------------------------------------------------
     775             :          */
     776             :         /*      Read case */
     777             :         /*      Loop over buffer computing source locations. */
     778             :         /* --------------------------------------------------------------------
     779             :          */
     780     1878270 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     781             :         {
     782             :             // Add small epsilon to avoid some numeric precision issues.
     783     1683960 :             const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
     784     1683960 :             const int iSrcY = static_cast<int>(std::min(
     785     1683960 :                 std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
     786             : 
     787     1683960 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     788             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     789             : 
     790     1683960 :             if (iSrcY >= nLimitBlockY)
     791             :             {
     792      234792 :                 nLBlockY = iSrcY / nBlockYSize;
     793      234792 :                 nLimitBlockY = nLBlockY * nBlockYSize;
     794      234792 :                 if (nLimitBlockY < INT_MAX - nBlockYSize)
     795      234792 :                     nLimitBlockY += nBlockYSize;
     796             :                 else
     797           0 :                     nLimitBlockY = INT_MAX;
     798             :                 // Make sure a new block is loaded.
     799      234792 :                 nStartBlockX = -nBlockXSize;
     800             :             }
     801     1449170 :             else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
     802             :             {
     803             :                 // Make sure a new block is loaded.
     804      429498 :                 nStartBlockX = -nBlockXSize;
     805             :             }
     806             : 
     807     1683960 :             GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
     808     1683960 :                                        static_cast<GPtrDiff_t>(nBlockXSize);
     809             : 
     810     1683960 :             if (bIntegerXFactor)
     811             :             {
     812      413154 :                 int iSrcX = static_cast<int>(dfSrcXStart);
     813      413154 :                 const int nSrcXInc = static_cast<int>(dfSrcXInc);
     814      413154 :                 GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
     815      413154 :                 bool bRet = false;
     816      413154 :                 if (bByteCopy)
     817             :                 {
     818      302831 :                     bRet = DownsamplingIntegerXFactor<true, 1>(
     819             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     820             :                         static_cast<int>(nPixelSpace), nBufXSize, GDT_Byte,
     821             :                         GDT_Byte, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     822             :                 }
     823      110323 :                 else if (eDataType == eBufType)
     824             :                 {
     825      110114 :                     switch (nBandDataSize)
     826             :                     {
     827      110090 :                         case 2:
     828      110090 :                             bRet = DownsamplingIntegerXFactor<true, 2>(
     829             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     830             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     831             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     832             :                                 nBlockXSize, poBlock, nLBlockY);
     833      110090 :                             break;
     834          16 :                         case 4:
     835          16 :                             bRet = DownsamplingIntegerXFactor<true, 4>(
     836             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     837             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     838             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     839             :                                 nBlockXSize, poBlock, nLBlockY);
     840          16 :                             break;
     841           8 :                         case 8:
     842           8 :                             bRet = DownsamplingIntegerXFactor<true, 8>(
     843             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     844             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     845             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     846             :                                 nBlockXSize, poBlock, nLBlockY);
     847           8 :                             break;
     848           0 :                         case 16:
     849           0 :                             bRet = DownsamplingIntegerXFactor<true, 16>(
     850             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     851             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     852             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     853             :                                 nBlockXSize, poBlock, nLBlockY);
     854           0 :                             break;
     855           0 :                         default:
     856           0 :                             CPLAssert(false);
     857             :                             break;
     858             :                     }
     859             :                 }
     860             :                 else
     861             :                 {
     862         209 :                     bRet = DownsamplingIntegerXFactor<false, 0>(
     863             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     864             :                         static_cast<int>(nPixelSpace), nBufXSize, eDataType,
     865             :                         eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     866             :                 }
     867      413154 :                 if (!bRet)
     868           1 :                     eErr = CE_Failure;
     869             :             }
     870             :             else
     871             :             {
     872     1270800 :                 double dfSrcX = dfSrcXStart;
     873   444966000 :                 for (int iBufXOff = 0; iBufXOff < nBufXSize;
     874   443695000 :                      iBufXOff++, dfSrcX += dfSrcXInc)
     875             :                 {
     876             :                     // TODO?: try to avoid the clamping for most iterations
     877             :                     const int iSrcX = static_cast<int>(
     878   887390000 :                         std::min(std::max(0.0, dfSrcX),
     879   443695000 :                                  static_cast<double>(nRasterXSize - 1)));
     880             : 
     881             :                     /* --------------------------------------------------------------------
     882             :                      */
     883             :                     /*      Ensure we have the appropriate block loaded. */
     884             :                     /* --------------------------------------------------------------------
     885             :                      */
     886   443695000 :                     if (iSrcX >= nBlockXSize + nStartBlockX)
     887             :                     {
     888     1704850 :                         const int nLBlockX = iSrcX / nBlockXSize;
     889     1704850 :                         nStartBlockX = nLBlockX * nBlockXSize;
     890             : 
     891     1704850 :                         if (poBlock != nullptr)
     892     1582480 :                             poBlock->DropLock();
     893             : 
     894     1704850 :                         poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     895     1704850 :                         if (poBlock == nullptr)
     896             :                         {
     897           9 :                             eErr = CE_Failure;
     898           9 :                             break;
     899             :                         }
     900             : 
     901             :                         pabySrcBlock =
     902     1704840 :                             static_cast<GByte *>(poBlock->GetDataRef());
     903             :                     }
     904   443695000 :                     const GPtrDiff_t nDiffX =
     905   443695000 :                         static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
     906             : 
     907             :                     /* --------------------------------------------------------------------
     908             :                      */
     909             :                     /*      Copy over this pixel of data. */
     910             :                     /* --------------------------------------------------------------------
     911             :                      */
     912             : 
     913   443695000 :                     if (bByteCopy)
     914             :                     {
     915   390257000 :                         GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
     916   390257000 :                         static_cast<GByte *>(pData)[iBufOffset] =
     917   390257000 :                             pabySrcBlock[iSrcOffset];
     918             :                     }
     919    53437500 :                     else if (eDataType == eBufType)
     920             :                     {
     921    48215500 :                         GPtrDiff_t iSrcOffset =
     922    48215500 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     923    48215500 :                         memcpy(static_cast<GByte *>(pData) + iBufOffset,
     924    48215500 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
     925             :                     }
     926             :                     else
     927             :                     {
     928             :                         // Type to type conversion ...
     929     5222010 :                         GPtrDiff_t iSrcOffset =
     930     5222010 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     931     5222010 :                         GDALCopyWords(pabySrcBlock + iSrcOffset, eDataType, 0,
     932     5222010 :                                       static_cast<GByte *>(pData) + iBufOffset,
     933             :                                       eBufType, 0, 1);
     934             :                     }
     935             : 
     936   443695000 :                     iBufOffset += static_cast<int>(nPixelSpace);
     937             :                 }
     938             :             }
     939     1683960 :             if (eErr == CE_Failure)
     940          11 :                 break;
     941             : 
     942     1686830 :             if (psExtraArg->pfnProgress != nullptr &&
     943        2878 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     944             :                                          psExtraArg->pProgressData))
     945             :             {
     946           1 :                 eErr = CE_Failure;
     947           1 :                 break;
     948             :             }
     949             :         }
     950             :     }
     951             : 
     952      360974 :     if (poBlock != nullptr)
     953      360964 :         poBlock->DropLock();
     954             : 
     955      360974 :     return eErr;
     956             : }
     957             : 
     958             : /************************************************************************/
     959             : /*                         GDALRasterIOTransformer()                    */
     960             : /************************************************************************/
     961             : 
     962             : struct GDALRasterIOTransformerStruct
     963             : {
     964             :     double dfXOff;
     965             :     double dfYOff;
     966             :     double dfXRatioDstToSrc;
     967             :     double dfYRatioDstToSrc;
     968             : };
     969             : 
     970        6748 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
     971             :                                    int nPointCount, double *x, double *y,
     972             :                                    double * /* z */, int *panSuccess)
     973             : {
     974        6748 :     GDALRasterIOTransformerStruct *psParams =
     975             :         static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);
     976        6748 :     if (bDstToSrc)
     977             :     {
     978      252996 :         for (int i = 0; i < nPointCount; i++)
     979             :         {
     980      246836 :             x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;
     981      246836 :             y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
     982      246836 :             panSuccess[i] = TRUE;
     983             :         }
     984             :     }
     985             :     else
     986             :     {
     987        1176 :         for (int i = 0; i < nPointCount; i++)
     988             :         {
     989         588 :             x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;
     990         588 :             y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
     991         588 :             panSuccess[i] = TRUE;
     992             :         }
     993             :     }
     994        6748 :     return TRUE;
     995             : }
     996             : 
     997             : /************************************************************************/
     998             : /*                          RasterIOResampled()                         */
     999             : /************************************************************************/
    1000             : 
    1001             : //! @cond Doxygen_Suppress
    1002        2516 : CPLErr GDALRasterBand::RasterIOResampled(
    1003             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1004             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1005             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    1006             : {
    1007             :     // Determine if we use warping resampling or overview resampling
    1008        2516 :     bool bUseWarp = false;
    1009        2516 :     if (GDALDataTypeIsComplex(eDataType))
    1010         149 :         bUseWarp = true;
    1011             : 
    1012        2516 :     double dfXOff = nXOff;
    1013        2516 :     double dfYOff = nYOff;
    1014        2516 :     double dfXSize = nXSize;
    1015        2516 :     double dfYSize = nYSize;
    1016        2516 :     if (psExtraArg->bFloatingPointWindowValidity)
    1017             :     {
    1018        2105 :         dfXOff = psExtraArg->dfXOff;
    1019        2105 :         dfYOff = psExtraArg->dfYOff;
    1020        2105 :         dfXSize = psExtraArg->dfXSize;
    1021        2105 :         dfYSize = psExtraArg->dfYSize;
    1022             :     }
    1023             : 
    1024        2516 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1025        2516 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1026             : 
    1027             :     // Determine the coordinates in the "virtual" output raster to see
    1028             :     // if there are not integers, in which case we will use them as a shift
    1029             :     // so that subwindow extracts give the exact same results as entire raster
    1030             :     // scaling.
    1031        2516 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1032        2516 :     bool bHasXOffVirtual = false;
    1033        2516 :     int nDestXOffVirtual = 0;
    1034        2516 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1035             :     {
    1036        2191 :         bHasXOffVirtual = true;
    1037        2191 :         dfXOff = nXOff;
    1038        2191 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1039             :     }
    1040             : 
    1041        2516 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1042        2516 :     bool bHasYOffVirtual = false;
    1043        2516 :     int nDestYOffVirtual = 0;
    1044        2516 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1045             :     {
    1046        2185 :         bHasYOffVirtual = true;
    1047        2185 :         dfYOff = nYOff;
    1048        2185 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1049             :     }
    1050             : 
    1051             :     // Create a MEM dataset that wraps the output buffer.
    1052             :     GDALDataset *poMEMDS;
    1053        2516 :     void *pTempBuffer = nullptr;
    1054        2516 :     GSpacing nPSMem = nPixelSpace;
    1055        2516 :     GSpacing nLSMem = nLineSpace;
    1056        2516 :     void *pDataMem = pData;
    1057        2516 :     GDALDataType eDTMem = eBufType;
    1058        2516 :     if (eBufType != eDataType)
    1059             :     {
    1060          28 :         nPSMem = GDALGetDataTypeSizeBytes(eDataType);
    1061          28 :         nLSMem = nPSMem * nBufXSize;
    1062             :         pTempBuffer =
    1063          28 :             VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
    1064          28 :         if (pTempBuffer == nullptr)
    1065           0 :             return CE_Failure;
    1066          28 :         pDataMem = pTempBuffer;
    1067          28 :         eDTMem = eDataType;
    1068             :     }
    1069             : 
    1070             :     poMEMDS =
    1071        2516 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1072             :                            nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
    1073        2516 :     GByte *pabyData = static_cast<GByte *>(pDataMem) -
    1074        2516 :                       nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
    1075        2516 :     GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
    1076             :         poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
    1077        2516 :     poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
    1078             : 
    1079        2516 :     const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1080        2516 :     if (pszNBITS)
    1081           6 :         reinterpret_cast<GDALRasterBand *>(hMEMBand)->SetMetadataItem(
    1082           6 :             "NBITS", pszNBITS, "IMAGE_STRUCTURE");
    1083             : 
    1084        2516 :     CPLErr eErr = CE_None;
    1085             : 
    1086             :     // Do the resampling.
    1087        2516 :     if (bUseWarp)
    1088             :     {
    1089         149 :         int bHasNoData = FALSE;
    1090         149 :         double dfNoDataValue = GetNoDataValue(&bHasNoData);
    1091             : 
    1092         149 :         VRTDatasetH hVRTDS = nullptr;
    1093         149 :         GDALRasterBandH hVRTBand = nullptr;
    1094         149 :         if (GetDataset() == nullptr)
    1095             :         {
    1096             :             /* Create VRT dataset that wraps the whole dataset */
    1097           0 :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1098           0 :             VRTAddBand(hVRTDS, eDataType, nullptr);
    1099           0 :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1100           0 :             VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
    1101             :                                0, 0, nRasterXSize, nRasterYSize, nullptr,
    1102             :                                VRT_NODATA_UNSET);
    1103             : 
    1104             :             /* Add a mask band if needed */
    1105           0 :             if (GetMaskFlags() != GMF_ALL_VALID)
    1106             :             {
    1107           0 :                 GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
    1108             :                 VRTSourcedRasterBand *poVRTMaskBand =
    1109             :                     reinterpret_cast<VRTSourcedRasterBand *>(
    1110             :                         reinterpret_cast<GDALRasterBand *>(hVRTBand)
    1111           0 :                             ->GetMaskBand());
    1112           0 :                 poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
    1113           0 :                                                  nRasterYSize, 0, 0,
    1114           0 :                                                  nRasterXSize, nRasterYSize);
    1115             :             }
    1116             :         }
    1117             : 
    1118         149 :         GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
    1119         149 :         switch (psExtraArg->eResampleAlg)
    1120             :         {
    1121           0 :             case GRIORA_NearestNeighbour:
    1122           0 :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1123           0 :                 break;
    1124         147 :             case GRIORA_Bilinear:
    1125         147 :                 psWarpOptions->eResampleAlg = GRA_Bilinear;
    1126         147 :                 break;
    1127           0 :             case GRIORA_Cubic:
    1128           0 :                 psWarpOptions->eResampleAlg = GRA_Cubic;
    1129           0 :                 break;
    1130           0 :             case GRIORA_CubicSpline:
    1131           0 :                 psWarpOptions->eResampleAlg = GRA_CubicSpline;
    1132           0 :                 break;
    1133           0 :             case GRIORA_Lanczos:
    1134           0 :                 psWarpOptions->eResampleAlg = GRA_Lanczos;
    1135           0 :                 break;
    1136           0 :             case GRIORA_Average:
    1137           0 :                 psWarpOptions->eResampleAlg = GRA_Average;
    1138           0 :                 break;
    1139           2 :             case GRIORA_RMS:
    1140           2 :                 psWarpOptions->eResampleAlg = GRA_RMS;
    1141           2 :                 break;
    1142           0 :             case GRIORA_Mode:
    1143           0 :                 psWarpOptions->eResampleAlg = GRA_Mode;
    1144           0 :                 break;
    1145           0 :             default:
    1146           0 :                 CPLAssert(false);
    1147             :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1148             :                 break;
    1149             :         }
    1150         149 :         psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
    1151         149 :         psWarpOptions->hDstDS = poMEMDS;
    1152         149 :         psWarpOptions->nBandCount = 1;
    1153         149 :         int nSrcBandNumber = hVRTDS ? 1 : nBand;
    1154         149 :         int nDstBandNumber = 1;
    1155         149 :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1156         149 :         psWarpOptions->panDstBands = &nDstBandNumber;
    1157         298 :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress
    1158         149 :                                          ? psExtraArg->pfnProgress
    1159             :                                          : GDALDummyProgress;
    1160         149 :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1161         149 :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1162         149 :         if (bHasNoData)
    1163             :         {
    1164           0 :             psWarpOptions->papszWarpOptions = CSLSetNameValue(
    1165             :                 psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
    1166           0 :             if (psWarpOptions->padfSrcNoDataReal == nullptr)
    1167             :             {
    1168           0 :                 psWarpOptions->padfSrcNoDataReal =
    1169           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1170           0 :                 psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
    1171             :             }
    1172             : 
    1173           0 :             if (psWarpOptions->padfDstNoDataReal == nullptr)
    1174             :             {
    1175           0 :                 psWarpOptions->padfDstNoDataReal =
    1176           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1177           0 :                 psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
    1178             :             }
    1179             :         }
    1180             : 
    1181             :         GDALRasterIOTransformerStruct sTransformer;
    1182         149 :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1183         149 :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1184         149 :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1185         149 :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1186         149 :         psWarpOptions->pTransformerArg = &sTransformer;
    1187             : 
    1188             :         GDALWarpOperationH hWarpOperation =
    1189         149 :             GDALCreateWarpOperation(psWarpOptions);
    1190         149 :         eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
    1191             :                                      nDestYOffVirtual, nBufXSize, nBufYSize);
    1192         149 :         GDALDestroyWarpOperation(hWarpOperation);
    1193             : 
    1194         149 :         psWarpOptions->panSrcBands = nullptr;
    1195         149 :         psWarpOptions->panDstBands = nullptr;
    1196         149 :         GDALDestroyWarpOptions(psWarpOptions);
    1197             : 
    1198         149 :         if (hVRTDS)
    1199           0 :             GDALClose(hVRTDS);
    1200             :     }
    1201             :     else
    1202             :     {
    1203        2367 :         const char *pszResampling =
    1204        2514 :             (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
    1205         230 :             : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
    1206         164 :             : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
    1207         157 :             : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
    1208         103 :             : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
    1209          39 :             : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
    1210          15 :             : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
    1211           3 :             : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
    1212             :                                                                : "UNKNOWN";
    1213             : 
    1214        2367 :         int nKernelRadius = 0;
    1215             :         GDALResampleFunction pfnResampleFunc =
    1216        2367 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1217        2367 :         CPLAssert(pfnResampleFunc);
    1218             :         GDALDataType eWrkDataType =
    1219        2367 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1220        2367 :         int nHasNoData = 0;
    1221        2367 :         double dfNoDataValue = GetNoDataValue(&nHasNoData);
    1222        2367 :         const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    1223        2367 :         if (!bHasNoData)
    1224        2310 :             dfNoDataValue = 0.0;
    1225             : 
    1226        2367 :         int nDstBlockXSize = nBufXSize;
    1227        2367 :         int nDstBlockYSize = nBufYSize;
    1228        2367 :         int nFullResXChunk = 0;
    1229        2367 :         int nFullResYChunk = 0;
    1230             :         while (true)
    1231             :         {
    1232        2367 :             nFullResXChunk =
    1233        2367 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1234        2367 :             nFullResYChunk =
    1235        2367 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1236        2367 :             if (nFullResXChunk > nRasterXSize)
    1237        2178 :                 nFullResXChunk = nRasterXSize;
    1238        2367 :             if (nFullResYChunk > nRasterYSize)
    1239         161 :                 nFullResYChunk = nRasterYSize;
    1240        2367 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1241        2352 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1242             :                  1024 * 1024))
    1243             :                 break;
    1244             :             // When operating on the full width of a raster whose block width is
    1245             :             // the raster width, prefer doing chunks in height.
    1246           0 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1247             :                 nDstBlockYSize > 1)
    1248           0 :                 nDstBlockYSize /= 2;
    1249             :             /* Otherwise cut the maximal dimension */
    1250           0 :             else if (nDstBlockXSize > 1 &&
    1251           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1252           0 :                 nDstBlockXSize /= 2;
    1253             :             else
    1254           0 :                 nDstBlockYSize /= 2;
    1255             :         }
    1256             : 
    1257        2367 :         int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
    1258        2367 :         int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    1259        2367 :         if (nOvrXFactor == 0)
    1260        2016 :             nOvrXFactor = 1;
    1261        2367 :         if (nOvrYFactor == 0)
    1262        2015 :             nOvrYFactor = 1;
    1263        2367 :         int nFullResXSizeQueried =
    1264        2367 :             nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
    1265        2367 :         int nFullResYSizeQueried =
    1266        2367 :             nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
    1267             : 
    1268        2367 :         if (nFullResXSizeQueried > nRasterXSize)
    1269        2108 :             nFullResXSizeQueried = nRasterXSize;
    1270        2367 :         if (nFullResYSizeQueried > nRasterYSize)
    1271         102 :             nFullResYSizeQueried = nRasterYSize;
    1272             : 
    1273             :         void *pChunk =
    1274        2367 :             VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    1275             :                                 nFullResXSizeQueried, nFullResYSizeQueried);
    1276        2367 :         GByte *pabyChunkNoDataMask = nullptr;
    1277             : 
    1278        2367 :         GDALRasterBand *poMaskBand = GetMaskBand();
    1279        2367 :         int l_nMaskFlags = GetMaskFlags();
    1280             : 
    1281        2367 :         bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
    1282        2367 :         if (bUseNoDataMask)
    1283             :         {
    1284         113 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1285             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1286             :         }
    1287        2367 :         if (pChunk == nullptr ||
    1288         113 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1289             :         {
    1290           0 :             GDALClose(poMEMDS);
    1291           0 :             CPLFree(pChunk);
    1292           0 :             CPLFree(pabyChunkNoDataMask);
    1293           0 :             VSIFree(pTempBuffer);
    1294           0 :             return CE_Failure;
    1295             :         }
    1296             : 
    1297        2367 :         int nTotalBlocks = ((nBufXSize + nDstBlockXSize - 1) / nDstBlockXSize) *
    1298        2367 :                            ((nBufYSize + nDstBlockYSize - 1) / nDstBlockYSize);
    1299        2367 :         int nBlocksDone = 0;
    1300             : 
    1301             :         int nDstYOff;
    1302        4734 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1303        2367 :              nDstYOff += nDstBlockYSize)
    1304             :         {
    1305             :             int nDstYCount;
    1306        2367 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1307        2367 :                 nDstYCount = nDstBlockYSize;
    1308             :             else
    1309           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1310             : 
    1311        2367 :             int nChunkYOff =
    1312        2367 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1313        2367 :             int nChunkYOff2 = nYOff + 1 +
    1314        2367 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1315             :                                                     dfYRatioDstToSrc));
    1316        2367 :             if (nChunkYOff2 > nRasterYSize)
    1317         269 :                 nChunkYOff2 = nRasterYSize;
    1318        2367 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1319        2367 :             CPLAssert(nYCount <= nFullResYChunk);
    1320             : 
    1321        2367 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
    1322        2367 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
    1323        2367 :             if (nChunkYOffQueried < 0)
    1324             :             {
    1325         204 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1326         204 :                 nChunkYOffQueried = 0;
    1327             :             }
    1328        2367 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1329         304 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1330        2367 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1331             : 
    1332        2367 :             int nDstXOff = 0;
    1333        4734 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1334        2367 :                  nDstXOff += nDstBlockXSize)
    1335             :             {
    1336        2367 :                 int nDstXCount = 0;
    1337        2367 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1338        2367 :                     nDstXCount = nDstBlockXSize;
    1339             :                 else
    1340           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1341             : 
    1342        2367 :                 int nChunkXOff =
    1343        2367 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1344        2367 :                 int nChunkXOff2 =
    1345        2367 :                     nXOff + 1 +
    1346        2367 :                     static_cast<int>(
    1347        2367 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1348        2367 :                 if (nChunkXOff2 > nRasterXSize)
    1349        2179 :                     nChunkXOff2 = nRasterXSize;
    1350        2367 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1351        2367 :                 CPLAssert(nXCount <= nFullResXChunk);
    1352             : 
    1353        2367 :                 int nChunkXOffQueried =
    1354        2367 :                     nChunkXOff - nKernelRadius * nOvrXFactor;
    1355        2367 :                 int nChunkXSizeQueried =
    1356        2367 :                     nXCount + 2 * nKernelRadius * nOvrXFactor;
    1357        2367 :                 if (nChunkXOffQueried < 0)
    1358             :                 {
    1359        2121 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1360        2121 :                     nChunkXOffQueried = 0;
    1361             :                 }
    1362        2367 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1363        2107 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1364        2367 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1365             : 
    1366             :                 // Read the source buffers.
    1367        2367 :                 eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1368             :                                 nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1369             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    1370             :                                 eWrkDataType, 0, 0, nullptr);
    1371             : 
    1372        2367 :                 bool bSkipResample = false;
    1373        2367 :                 bool bNoDataMaskFullyOpaque = false;
    1374        2367 :                 if (eErr == CE_None && bUseNoDataMask)
    1375             :                 {
    1376         113 :                     eErr = poMaskBand->RasterIO(
    1377             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1378             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1379             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1380             :                         nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
    1381             : 
    1382             :                     /* Optimizations if mask if fully opaque or transparent */
    1383         113 :                     int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1384         113 :                     GByte bVal = pabyChunkNoDataMask[0];
    1385         113 :                     int i = 1;
    1386      235275 :                     for (; i < nPixels; i++)
    1387             :                     {
    1388      235232 :                         if (pabyChunkNoDataMask[i] != bVal)
    1389          70 :                             break;
    1390             :                     }
    1391         113 :                     if (i == nPixels)
    1392             :                     {
    1393          43 :                         if (bVal == 0)
    1394             :                         {
    1395         712 :                             for (int j = 0; j < nDstYCount; j++)
    1396             :                             {
    1397         686 :                                 GDALCopyWords(&dfNoDataValue, GDT_Float64, 0,
    1398             :                                               static_cast<GByte *>(pDataMem) +
    1399         686 :                                                   nLSMem * (j + nDstYOff) +
    1400         686 :                                                   nDstXOff * nPSMem,
    1401             :                                               eDTMem, static_cast<int>(nPSMem),
    1402             :                                               nDstXCount);
    1403             :                             }
    1404          26 :                             bSkipResample = true;
    1405             :                         }
    1406             :                         else
    1407             :                         {
    1408          17 :                             bNoDataMaskFullyOpaque = true;
    1409             :                         }
    1410             :                     }
    1411             :                 }
    1412             : 
    1413        2367 :                 if (!bSkipResample && eErr == CE_None)
    1414             :                 {
    1415        2339 :                     const bool bPropagateNoData = false;
    1416        2339 :                     void *pDstBuffer = nullptr;
    1417        2339 :                     GDALDataType eDstBufferDataType = GDT_Unknown;
    1418             :                     GDALRasterBand *poMEMBand =
    1419        2339 :                         GDALRasterBand::FromHandle(hMEMBand);
    1420        2339 :                     eErr = pfnResampleFunc(
    1421             :                         dfXRatioDstToSrc, dfYRatioDstToSrc,
    1422             :                         dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
    1423             :                         dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
    1424             :                         eWrkDataType, pChunk,
    1425             :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
    1426        2339 :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
    1427             :                         nChunkXSizeQueried,
    1428        2339 :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
    1429             :                         nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
    1430        2339 :                         nDstXOff + nDestXOffVirtual + nDstXCount,
    1431             :                         nDstYOff + nDestYOffVirtual,
    1432        2339 :                         nDstYOff + nDestYOffVirtual + nDstYCount, poMEMBand,
    1433             :                         &pDstBuffer, &eDstBufferDataType, pszResampling,
    1434        2339 :                         bHasNoData, dfNoDataValue, GetColorTable(), eDataType,
    1435             :                         bPropagateNoData);
    1436        2339 :                     if (eErr == CE_None)
    1437             :                     {
    1438        2339 :                         eErr = poMEMBand->RasterIO(
    1439             :                             GF_Write, nDstXOff + nDestXOffVirtual,
    1440             :                             nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
    1441             :                             pDstBuffer, nDstXCount, nDstYCount,
    1442             :                             eDstBufferDataType, 0, 0, nullptr);
    1443             :                     }
    1444        2339 :                     CPLFree(pDstBuffer);
    1445             :                 }
    1446             : 
    1447        2367 :                 nBlocksDone++;
    1448        2378 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1449          11 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    1450             :                                              "", psExtraArg->pProgressData))
    1451             :                 {
    1452           1 :                     eErr = CE_Failure;
    1453             :                 }
    1454             :             }
    1455             :         }
    1456             : 
    1457        2367 :         CPLFree(pChunk);
    1458        2367 :         CPLFree(pabyChunkNoDataMask);
    1459             :     }
    1460             : 
    1461        2516 :     if (eBufType != eDataType)
    1462             :     {
    1463          28 :         CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
    1464             :             GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
    1465             :             pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    1466             :             nullptr));
    1467             :     }
    1468        2516 :     GDALClose(poMEMDS);
    1469        2516 :     VSIFree(pTempBuffer);
    1470             : 
    1471        2516 :     return eErr;
    1472             : }
    1473             : 
    1474             : /************************************************************************/
    1475             : /*                          RasterIOResampled()                         */
    1476             : /************************************************************************/
    1477             : 
    1478         280 : CPLErr GDALDataset::RasterIOResampled(GDALRWFlag /* eRWFlag */, int nXOff,
    1479             :                                       int nYOff, int nXSize, int nYSize,
    1480             :                                       void *pData, int nBufXSize, int nBufYSize,
    1481             :                                       GDALDataType eBufType, int nBandCount,
    1482             :                                       int *panBandMap, GSpacing nPixelSpace,
    1483             :                                       GSpacing nLineSpace, GSpacing nBandSpace,
    1484             :                                       GDALRasterIOExtraArg *psExtraArg)
    1485             : 
    1486             : {
    1487             : #if 0
    1488             :     // Determine if we use warping resampling or overview resampling
    1489             :     bool bUseWarp = false;
    1490             :     if( GDALDataTypeIsComplex( eDataType ) )
    1491             :         bUseWarp = true;
    1492             : #endif
    1493             : 
    1494         280 :     double dfXOff = nXOff;
    1495         280 :     double dfYOff = nYOff;
    1496         280 :     double dfXSize = nXSize;
    1497         280 :     double dfYSize = nYSize;
    1498         280 :     if (psExtraArg->bFloatingPointWindowValidity)
    1499             :     {
    1500         158 :         dfXOff = psExtraArg->dfXOff;
    1501         158 :         dfYOff = psExtraArg->dfYOff;
    1502         158 :         dfXSize = psExtraArg->dfXSize;
    1503         158 :         dfYSize = psExtraArg->dfYSize;
    1504             :     }
    1505             : 
    1506         280 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1507         280 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1508             : 
    1509             :     // Determine the coordinates in the "virtual" output raster to see
    1510             :     // if there are not integers, in which case we will use them as a shift
    1511             :     // so that subwindow extracts give the exact same results as entire raster
    1512             :     // scaling.
    1513         280 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1514         280 :     bool bHasXOffVirtual = false;
    1515         280 :     int nDestXOffVirtual = 0;
    1516         280 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1517             :     {
    1518         162 :         bHasXOffVirtual = true;
    1519         162 :         dfXOff = nXOff;
    1520         162 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1521             :     }
    1522             : 
    1523         280 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1524         280 :     bool bHasYOffVirtual = false;
    1525         280 :     int nDestYOffVirtual = 0;
    1526         280 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1527             :     {
    1528         124 :         bHasYOffVirtual = true;
    1529         124 :         dfYOff = nYOff;
    1530         124 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1531             :     }
    1532             : 
    1533             :     // Create a MEM dataset that wraps the output buffer.
    1534             :     GDALDataset *poMEMDS =
    1535         280 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1536             :                            nDestYOffVirtual + nBufYSize, 0, eBufType, nullptr);
    1537             :     GDALRasterBand **papoDstBands = static_cast<GDALRasterBand **>(
    1538         275 :         CPLMalloc(nBandCount * sizeof(GDALRasterBand *)));
    1539        1190 :     for (int i = 0; i < nBandCount; i++)
    1540             :     {
    1541         917 :         char szBuffer[32] = {'\0'};
    1542        1856 :         int nRet = CPLPrintPointer(
    1543             :             szBuffer,
    1544         917 :             static_cast<GByte *>(pData) - nPixelSpace * nDestXOffVirtual -
    1545         917 :                 nLineSpace * nDestYOffVirtual + nBandSpace * i,
    1546             :             sizeof(szBuffer));
    1547         939 :         szBuffer[nRet] = 0;
    1548             : 
    1549         939 :         char szBuffer0[64] = {'\0'};
    1550         939 :         snprintf(szBuffer0, sizeof(szBuffer0), "DATAPOINTER=%s", szBuffer);
    1551             : 
    1552         939 :         char szBuffer1[64] = {'\0'};
    1553         939 :         snprintf(szBuffer1, sizeof(szBuffer1), "PIXELOFFSET=" CPL_FRMT_GIB,
    1554             :                  static_cast<GIntBig>(nPixelSpace));
    1555             : 
    1556         939 :         char szBuffer2[64] = {'\0'};
    1557         939 :         snprintf(szBuffer2, sizeof(szBuffer2), "LINEOFFSET=" CPL_FRMT_GIB,
    1558             :                  static_cast<GIntBig>(nLineSpace));
    1559             : 
    1560         939 :         char *apszOptions[4] = {szBuffer0, szBuffer1, szBuffer2, nullptr};
    1561             : 
    1562         939 :         poMEMDS->AddBand(eBufType, apszOptions);
    1563             : 
    1564         944 :         GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
    1565         928 :         papoDstBands[i] = poMEMDS->GetRasterBand(i + 1);
    1566             :         const char *pszNBITS =
    1567         928 :             poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1568         916 :         if (pszNBITS)
    1569           0 :             poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
    1570           0 :                                                            "IMAGE_STRUCTURE");
    1571             :     }
    1572             : 
    1573         273 :     CPLErr eErr = CE_None;
    1574             : 
    1575             :     // TODO(schwehr): Why disabled?  Why not just delete?
    1576             :     // Looks like this code was initially added as disable by copying
    1577             :     // from RasterIO here:
    1578             :     // https://trac.osgeo.org/gdal/changeset/29572
    1579             : #if 0
    1580             :     // Do the resampling.
    1581             :     if( bUseWarp )
    1582             :     {
    1583             :         VRTDatasetH hVRTDS = nullptr;
    1584             :         GDALRasterBandH hVRTBand = nullptr;
    1585             :         if( GetDataset() == nullptr )
    1586             :         {
    1587             :             /* Create VRT dataset that wraps the whole dataset */
    1588             :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1589             :             VRTAddBand( hVRTDS, eDataType, nullptr );
    1590             :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1591             :             VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
    1592             :                                 (GDALRasterBandH)this,
    1593             :                                 0, 0,
    1594             :                                 nRasterXSize, nRasterYSize,
    1595             :                                 0, 0,
    1596             :                                 nRasterXSize, nRasterYSize,
    1597             :                                 nullptr, VRT_NODATA_UNSET );
    1598             : 
    1599             :             /* Add a mask band if needed */
    1600             :             if( GetMaskFlags() != GMF_ALL_VALID )
    1601             :             {
    1602             :                 ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
    1603             :                 VRTSourcedRasterBand* poVRTMaskBand =
    1604             :                     (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
    1605             :                 poVRTMaskBand->
    1606             :                     AddMaskBandSource( this,
    1607             :                                     0, 0,
    1608             :                                     nRasterXSize, nRasterYSize,
    1609             :                                     0, 0,
    1610             :                                     nRasterXSize, nRasterYSize);
    1611             :             }
    1612             :         }
    1613             : 
    1614             :         GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
    1615             :         psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
    1616             :         psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
    1617             :         psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
    1618             :         psWarpOptions->nBandCount = 1;
    1619             :         int nSrcBandNumber = (hVRTDS ? 1 : nBand);
    1620             :         int nDstBandNumber = 1;
    1621             :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1622             :         psWarpOptions->panDstBands = &nDstBandNumber;
    1623             :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
    1624             :                     psExtraArg->pfnProgress : GDALDummyProgress;
    1625             :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1626             :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1627             :         GDALRasterIOTransformerStruct sTransformer;
    1628             :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1629             :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1630             :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1631             :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1632             :         psWarpOptions->pTransformerArg = &sTransformer;
    1633             : 
    1634             :         GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
    1635             :         eErr = GDALChunkAndWarpImage( hWarpOperation,
    1636             :                                       nDestXOffVirtual, nDestYOffVirtual,
    1637             :                                       nBufXSize, nBufYSize );
    1638             :         GDALDestroyWarpOperation( hWarpOperation );
    1639             : 
    1640             :         psWarpOptions->panSrcBands = nullptr;
    1641             :         psWarpOptions->panDstBands = nullptr;
    1642             :         GDALDestroyWarpOptions( psWarpOptions );
    1643             : 
    1644             :         if( hVRTDS )
    1645             :             GDALClose(hVRTDS);
    1646             :     }
    1647             :     else
    1648             : #endif
    1649             :     {
    1650         273 :         const char *pszResampling =
    1651         430 :             (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
    1652         157 :             : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
    1653           0 :             : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
    1654           0 :             : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
    1655           0 :             : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
    1656           0 :             : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
    1657           0 :             : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
    1658           0 :             : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
    1659             :                                                                : "UNKNOWN";
    1660             : 
    1661         273 :         GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
    1662         266 :         GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
    1663             :         int nBlockXSize, nBlockYSize;
    1664         270 :         poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    1665             : 
    1666             :         int nKernelRadius;
    1667             :         GDALResampleFunction pfnResampleFunc =
    1668         266 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1669         255 :         CPLAssert(pfnResampleFunc);
    1670             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1671             :         GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
    1672             :             GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
    1673             : #endif
    1674             :         GDALDataType eWrkDataType =
    1675         255 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1676             : 
    1677         259 :         int nDstBlockXSize = nBufXSize;
    1678         259 :         int nDstBlockYSize = nBufYSize;
    1679             :         int nFullResXChunk, nFullResYChunk;
    1680             :         while (true)
    1681             :         {
    1682         259 :             nFullResXChunk =
    1683         259 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1684         259 :             nFullResYChunk =
    1685         259 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1686         259 :             if (nFullResXChunk > nRasterXSize)
    1687         138 :                 nFullResXChunk = nRasterXSize;
    1688         259 :             if (nFullResYChunk > nRasterYSize)
    1689          33 :                 nFullResYChunk = nRasterYSize;
    1690         259 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1691         257 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1692             :                  1024 * 1024))
    1693             :                 break;
    1694             :             // When operating on the full width of a raster whose block width is
    1695             :             // the raster width, prefer doing chunks in height.
    1696           0 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1697             :                 nDstBlockYSize > 1)
    1698           0 :                 nDstBlockYSize /= 2;
    1699             :             /* Otherwise cut the maximal dimension */
    1700           0 :             else if (nDstBlockXSize > 1 &&
    1701           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1702           0 :                 nDstBlockXSize /= 2;
    1703             :             else
    1704           0 :                 nDstBlockYSize /= 2;
    1705             :         }
    1706             : 
    1707         523 :         int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    1708         259 :                                   static_cast<int>(0.5 + dfYRatioDstToSrc));
    1709         264 :         if (nOvrFactor == 0)
    1710          93 :             nOvrFactor = 1;
    1711         264 :         int nFullResXSizeQueried =
    1712         264 :             nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    1713         264 :         int nFullResYSizeQueried =
    1714         264 :             nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
    1715             : 
    1716         264 :         if (nFullResXSizeQueried > nRasterXSize)
    1717         147 :             nFullResXSizeQueried = nRasterXSize;
    1718         264 :         if (nFullResYSizeQueried > nRasterYSize)
    1719          36 :             nFullResYSizeQueried = nRasterYSize;
    1720             : 
    1721         264 :         void *pChunk = VSI_MALLOC3_VERBOSE(
    1722             :             cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
    1723             :                               nBandCount),
    1724             :             nFullResXSizeQueried, nFullResYSizeQueried);
    1725         276 :         GByte *pabyChunkNoDataMask = nullptr;
    1726             : 
    1727         276 :         GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
    1728         273 :         int nMaskFlags = poFirstSrcBand->GetMaskFlags();
    1729             : 
    1730         276 :         bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
    1731         276 :         if (bUseNoDataMask)
    1732             :         {
    1733          55 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1734             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1735             :         }
    1736         276 :         if (pChunk == nullptr ||
    1737          55 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1738             :         {
    1739          11 :             GDALClose(poMEMDS);
    1740           0 :             CPLFree(pChunk);
    1741           0 :             CPLFree(pabyChunkNoDataMask);
    1742           0 :             CPLFree(papoDstBands);
    1743           0 :             return CE_Failure;
    1744             :         }
    1745             : 
    1746         265 :         int nTotalBlocks = ((nBufXSize + nDstBlockXSize - 1) / nDstBlockXSize) *
    1747         265 :                            ((nBufYSize + nDstBlockYSize - 1) / nDstBlockYSize);
    1748         265 :         int nBlocksDone = 0;
    1749             : 
    1750             :         int nDstYOff;
    1751         560 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1752         295 :              nDstYOff += nDstBlockYSize)
    1753             :         {
    1754             :             int nDstYCount;
    1755         265 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1756         267 :                 nDstYCount = nDstBlockYSize;
    1757             :             else
    1758           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1759             : 
    1760         265 :             int nChunkYOff =
    1761         265 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1762         265 :             int nChunkYOff2 = nYOff + 1 +
    1763         265 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1764             :                                                     dfYRatioDstToSrc));
    1765         265 :             if (nChunkYOff2 > nRasterYSize)
    1766          56 :                 nChunkYOff2 = nRasterYSize;
    1767         265 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1768         265 :             CPLAssert(nYCount <= nFullResYChunk);
    1769             : 
    1770         265 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    1771         265 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
    1772         265 :             if (nChunkYOffQueried < 0)
    1773             :             {
    1774          56 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1775          56 :                 nChunkYOffQueried = 0;
    1776             :             }
    1777         265 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1778          66 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1779         265 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1780             : 
    1781             :             int nDstXOff;
    1782         545 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1783         280 :                  nDstXOff += nDstBlockXSize)
    1784             :             {
    1785             :                 int nDstXCount;
    1786         250 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1787         257 :                     nDstXCount = nDstBlockXSize;
    1788             :                 else
    1789           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1790             : 
    1791         250 :                 int nChunkXOff =
    1792         250 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1793         250 :                 int nChunkXOff2 =
    1794         250 :                     nXOff + 1 +
    1795         250 :                     static_cast<int>(
    1796         250 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1797         250 :                 if (nChunkXOff2 > nRasterXSize)
    1798         137 :                     nChunkXOff2 = nRasterXSize;
    1799         250 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1800         250 :                 CPLAssert(nXCount <= nFullResXChunk);
    1801             : 
    1802         250 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    1803         250 :                 int nChunkXSizeQueried =
    1804         250 :                     nXCount + 2 * nKernelRadius * nOvrFactor;
    1805         250 :                 if (nChunkXOffQueried < 0)
    1806             :                 {
    1807         129 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1808         129 :                     nChunkXOffQueried = 0;
    1809             :                 }
    1810         250 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1811         138 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1812         250 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1813             : 
    1814         250 :                 bool bSkipResample = false;
    1815         250 :                 bool bNoDataMaskFullyOpaque = false;
    1816         250 :                 if (eErr == CE_None && bUseNoDataMask)
    1817             :                 {
    1818          55 :                     eErr = poMaskBand->RasterIO(
    1819             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1820             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1821             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1822             :                         nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
    1823             : 
    1824             :                     /* Optimizations if mask if fully opaque or transparent */
    1825          55 :                     const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1826          55 :                     const GByte bVal = pabyChunkNoDataMask[0];
    1827          55 :                     int i = 1;  // Used after for.
    1828      123794 :                     for (; i < nPixels; i++)
    1829             :                     {
    1830      123777 :                         if (pabyChunkNoDataMask[i] != bVal)
    1831          38 :                             break;
    1832             :                     }
    1833          55 :                     if (i == nPixels)
    1834             :                     {
    1835          17 :                         if (bVal == 0)
    1836             :                         {
    1837          16 :                             GByte abyZero[16] = {0};
    1838          64 :                             for (int iBand = 0; iBand < nBandCount; iBand++)
    1839             :                             {
    1840        2016 :                                 for (int j = 0; j < nDstYCount; j++)
    1841             :                                 {
    1842        1968 :                                     GDALCopyWords(
    1843             :                                         abyZero, GDT_Byte, 0,
    1844             :                                         static_cast<GByte *>(pData) +
    1845        1968 :                                             iBand * nBandSpace +
    1846        1968 :                                             nLineSpace * (j + nDstYOff) +
    1847        1968 :                                             nDstXOff * nPixelSpace,
    1848             :                                         eBufType, static_cast<int>(nPixelSpace),
    1849             :                                         nDstXCount);
    1850             :                                 }
    1851             :                             }
    1852          16 :                             bSkipResample = true;
    1853             :                         }
    1854             :                         else
    1855             :                         {
    1856           1 :                             bNoDataMaskFullyOpaque = true;
    1857             :                         }
    1858             :                     }
    1859             :                 }
    1860             : 
    1861         250 :                 if (!bSkipResample && eErr == CE_None)
    1862             :                 {
    1863             :                     /* Read the source buffers */
    1864         234 :                     eErr = RasterIO(
    1865             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1866             :                         nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1867             :                         nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
    1868             :                         nBandCount, panBandMap, 0, 0, 0, nullptr);
    1869             :                 }
    1870             : 
    1871             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1872             :                 if (pfnResampleFuncMultiBands && !bSkipResample &&
    1873             :                     eErr == CE_None)
    1874             :                 {
    1875             :                     eErr = pfnResampleFuncMultiBands(
    1876             :                         dfXRatioDstToSrc, dfYRatioDstToSrc,
    1877             :                         dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
    1878             :                         dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
    1879             :                         eWrkDataType, (GByte *)pChunk, nBandCount,
    1880             :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
    1881             :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
    1882             :                         nChunkXSizeQueried,
    1883             :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
    1884             :                         nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
    1885             :                         nDstXOff + nDestXOffVirtual + nDstXCount,
    1886             :                         nDstYOff + nDestYOffVirtual,
    1887             :                         nDstYOff + nDestYOffVirtual + nDstYCount, papoDstBands,
    1888             :                         pszResampling, FALSE /*bHasNoData*/,
    1889             :                         0.0 /* dfNoDataValue */, nullptr /* color table*/,
    1890             :                         eDataType);
    1891             :                 }
    1892             :                 else
    1893             : #endif
    1894             :                 {
    1895             :                     size_t nChunkBandOffset =
    1896         277 :                         static_cast<size_t>(nChunkXSizeQueried) *
    1897         277 :                         nChunkYSizeQueried *
    1898         277 :                         GDALGetDataTypeSizeBytes(eWrkDataType);
    1899        1188 :                     for (int i = 0;
    1900        1188 :                          i < nBandCount && !bSkipResample && eErr == CE_None;
    1901             :                          i++)
    1902             :                     {
    1903         908 :                         const bool bPropagateNoData = false;
    1904         908 :                         void *pDstBuffer = nullptr;
    1905         908 :                         GDALDataType eDstBufferDataType = GDT_Unknown;
    1906             :                         GDALRasterBand *poMEMBand =
    1907         908 :                             poMEMDS->GetRasterBand(i + 1);
    1908         909 :                         eErr = pfnResampleFunc(
    1909             :                             dfXRatioDstToSrc, dfYRatioDstToSrc,
    1910             :                             dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
    1911             :                             dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
    1912             :                             eWrkDataType,
    1913         909 :                             reinterpret_cast<GByte *>(pChunk) +
    1914         909 :                                 i * nChunkBandOffset,
    1915             :                             bNoDataMaskFullyOpaque ? nullptr
    1916             :                                                    : pabyChunkNoDataMask,
    1917         909 :                             nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
    1918             :                             nChunkXSizeQueried,
    1919         909 :                             nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
    1920             :                             nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
    1921         909 :                             nDstXOff + nDestXOffVirtual + nDstXCount,
    1922             :                             nDstYOff + nDestYOffVirtual,
    1923         909 :                             nDstYOff + nDestYOffVirtual + nDstYCount, poMEMBand,
    1924             :                             &pDstBuffer, &eDstBufferDataType, pszResampling,
    1925             :                             false /*bHasNoData*/, 0.0 /* dfNoDataValue */,
    1926             :                             nullptr /* color table*/, eDataType,
    1927             :                             bPropagateNoData);
    1928         910 :                         if (eErr == CE_None)
    1929             :                         {
    1930         910 :                             eErr = poMEMBand->RasterIO(
    1931             :                                 GF_Write, nDstXOff + nDestXOffVirtual,
    1932             :                                 nDstYOff + nDestYOffVirtual, nDstXCount,
    1933             :                                 nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
    1934             :                                 eDstBufferDataType, 0, 0, nullptr);
    1935             :                         }
    1936         910 :                         CPLFree(pDstBuffer);
    1937             :                     }
    1938             :                 }
    1939             : 
    1940         280 :                 nBlocksDone++;
    1941         282 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1942           2 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    1943             :                                              "", psExtraArg->pProgressData))
    1944             :                 {
    1945           0 :                     eErr = CE_Failure;
    1946             :                 }
    1947             :             }
    1948             :         }
    1949             : 
    1950         295 :         CPLFree(pChunk);
    1951         280 :         CPLFree(pabyChunkNoDataMask);
    1952             :     }
    1953             : 
    1954         280 :     CPLFree(papoDstBands);
    1955         280 :     GDALClose(poMEMDS);
    1956             : 
    1957         280 :     return eErr;
    1958             : }
    1959             : 
    1960             : //! @endcond
    1961             : 
    1962             : /************************************************************************/
    1963             : /*                           GDALSwapWords()                            */
    1964             : /************************************************************************/
    1965             : 
    1966             : /**
    1967             :  * Byte swap words in-place.
    1968             :  *
    1969             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    1970             :  * a memory array.  No assumption is made that the words being swapped are
    1971             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    1972             :  * to determine if the current platform is big endian or little endian.  Use
    1973             :  * The macros like CPL_SWAP32() to byte swap single values without the overhead
    1974             :  * of a function call.
    1975             :  *
    1976             :  * @param pData pointer to start of data buffer.
    1977             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    1978             :  * @param nWordCount the number of words to be swapped in this call.
    1979             :  * @param nWordSkip the byte offset from the start of one word to the start of
    1980             :  * the next. For packed buffers this is the same as nWordSize.
    1981             :  */
    1982             : 
    1983      390131 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
    1984             :                                int nWordSkip)
    1985             : 
    1986             : {
    1987      390131 :     if (nWordCount > 0)
    1988      390131 :         VALIDATE_POINTER0(pData, "GDALSwapWords");
    1989             : 
    1990      390131 :     GByte *pabyData = static_cast<GByte *>(pData);
    1991             : 
    1992      390131 :     switch (nWordSize)
    1993             :     {
    1994        3603 :         case 1:
    1995        3603 :             break;
    1996             : 
    1997      378958 :         case 2:
    1998      378958 :             CPLAssert(nWordSkip >= 2 || nWordCount == 1);
    1999   117423000 :             for (int i = 0; i < nWordCount; i++)
    2000             :             {
    2001   117044000 :                 CPL_SWAP16PTR(pabyData);
    2002   117044000 :                 pabyData += nWordSkip;
    2003             :             }
    2004      378958 :             break;
    2005             : 
    2006        7087 :         case 4:
    2007        7087 :             CPLAssert(nWordSkip >= 4 || nWordCount == 1);
    2008        7087 :             if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
    2009             :             {
    2010    16177200 :                 for (int i = 0; i < nWordCount; i++)
    2011             :                 {
    2012    16170100 :                     *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
    2013             :                         *reinterpret_cast<const GUInt32 *>(pabyData));
    2014    16170100 :                     pabyData += nWordSkip;
    2015        7084 :                 }
    2016             :             }
    2017             :             else
    2018             :             {
    2019           9 :                 for (int i = 0; i < nWordCount; i++)
    2020             :                 {
    2021           6 :                     CPL_SWAP32PTR(pabyData);
    2022           6 :                     pabyData += nWordSkip;
    2023             :                 }
    2024             :             }
    2025        7087 :             break;
    2026             : 
    2027         483 :         case 8:
    2028         483 :             CPLAssert(nWordSkip >= 8 || nWordCount == 1);
    2029             : #ifdef CPL_HAS_GINT64
    2030         483 :             if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
    2031             :             {
    2032       27418 :                 for (int i = 0; i < nWordCount; i++)
    2033             :                 {
    2034       26936 :                     *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
    2035             :                         *reinterpret_cast<const GUInt64 *>(pabyData));
    2036       26936 :                     pabyData += nWordSkip;
    2037         482 :                 }
    2038             :             }
    2039             :             else
    2040             : #endif
    2041             :             {
    2042           3 :                 for (int i = 0; i < nWordCount; i++)
    2043             :                 {
    2044           2 :                     CPL_SWAP64PTR(pabyData);
    2045           2 :                     pabyData += nWordSkip;
    2046             :                 }
    2047             :             }
    2048         483 :             break;
    2049             : 
    2050           0 :         default:
    2051           0 :             CPLAssert(false);
    2052             :     }
    2053             : }
    2054             : 
    2055             : /************************************************************************/
    2056             : /*                           GDALSwapWordsEx()                          */
    2057             : /************************************************************************/
    2058             : 
    2059             : /**
    2060             :  * Byte swap words in-place.
    2061             :  *
    2062             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2063             :  * a memory array.  No assumption is made that the words being swapped are
    2064             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2065             :  * to determine if the current platform is big endian or little endian.  Use
    2066             :  * The macros like CPL_SWAP32() to byte swap single values without the overhead
    2067             :  * of a function call.
    2068             :  *
    2069             :  * @param pData pointer to start of data buffer.
    2070             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2071             :  * @param nWordCount the number of words to be swapped in this call.
    2072             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2073             :  * the next. For packed buffers this is the same as nWordSize.
    2074             :  * @since GDAL 2.1
    2075             :  */
    2076        4240 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
    2077             :                                  int nWordSkip)
    2078             : {
    2079        4240 :     GByte *pabyData = static_cast<GByte *>(pData);
    2080        8480 :     while (nWordCount)
    2081             :     {
    2082             :         // Pick-up a multiple of 8 as max chunk size.
    2083        4240 :         const int nWordCountSmall =
    2084        4240 :             (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
    2085        4240 :         GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
    2086        4240 :         pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
    2087        4240 :         nWordCount -= nWordCountSmall;
    2088             :     }
    2089        4240 : }
    2090             : 
    2091             : // Place the new GDALCopyWords helpers in an anonymous namespace
    2092             : namespace
    2093             : {
    2094             : 
    2095             : /************************************************************************/
    2096             : /*                           GDALCopyWordsT()                           */
    2097             : /************************************************************************/
    2098             : /**
    2099             :  * Template function, used to copy data from pSrcData into buffer
    2100             :  * pDstData, with stride nSrcPixelStride in the source data and
    2101             :  * stride nDstPixelStride in the destination data. This template can
    2102             :  * deal with the case where the input data type is real or complex and
    2103             :  * the output is real.
    2104             :  *
    2105             :  * @param pSrcData the source data buffer
    2106             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2107             :  *                      of interest.
    2108             :  * @param pDstData the destination buffer.
    2109             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2110             :  *                      interest.
    2111             :  * @param nWordCount the total number of pixel words to copy
    2112             :  *
    2113             :  * @code
    2114             :  * // Assume an input buffer of type GUInt16 named pBufferIn
    2115             :  * GByte *pBufferOut = new GByte[numBytesOut];
    2116             :  * GDALCopyWordsT<GUInt16, GByte>(pSrcData, 2, pDstData, 1, numBytesOut);
    2117             :  * @endcode
    2118             :  * @note
    2119             :  * This is a private function, and should not be exposed outside of
    2120             :  * rasterio.cpp. External users should call the GDALCopyWords driver function.
    2121             :  */
    2122             : 
    2123             : template <class Tin, class Tout>
    2124    45979756 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
    2125             :                                          int nSrcPixelStride,
    2126             :                                          Tout *const CPL_RESTRICT pDstData,
    2127             :                                          int nDstPixelStride,
    2128             :                                          GPtrDiff_t nWordCount)
    2129             : {
    2130    45979756 :     decltype(nWordCount) nDstOffset = 0;
    2131             : 
    2132    45979756 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2133    45979756 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2134   465950773 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2135             :     {
    2136   419970998 :         const Tin tValue =
    2137   419970998 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2138   419970998 :         Tout *const pOutPixel =
    2139   419970998 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2140             : 
    2141   419970998 :         GDALCopyWord(tValue, *pOutPixel);
    2142             : 
    2143   419970998 :         nDstOffset += nDstPixelStride;
    2144             :     }
    2145    45979756 : }
    2146             : 
    2147             : template <class Tin, class Tout>
    2148    38163060 : static void inline GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
    2149             :                                   int nSrcPixelStride,
    2150             :                                   Tout *const CPL_RESTRICT pDstData,
    2151             :                                   int nDstPixelStride, GPtrDiff_t nWordCount)
    2152             : {
    2153    38163060 :     GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
    2154             :                           nWordCount);
    2155    38163060 : }
    2156             : 
    2157             : template <class Tin, class Tout>
    2158     6371626 : static void inline GDALCopyWordsT_8atatime(
    2159             :     const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
    2160             :     Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
    2161             :     GPtrDiff_t nWordCount)
    2162             : {
    2163     6371626 :     decltype(nWordCount) nDstOffset = 0;
    2164             : 
    2165     6371626 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2166     6371626 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2167     6371626 :     decltype(nWordCount) n = 0;
    2168     6371626 :     if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
    2169             :         nDstPixelStride == static_cast<int>(sizeof(Tout)))
    2170             :     {
    2171    20977890 :         for (; n < nWordCount - 7; n += 8)
    2172             :         {
    2173    20793489 :             const Tin *pInValues = reinterpret_cast<const Tin *>(
    2174    20793489 :                 pSrcDataPtr + (n * nSrcPixelStride));
    2175    20793489 :             Tout *const pOutPixels =
    2176    20793489 :                 reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2177             : 
    2178    20793489 :             GDALCopy8Words(pInValues, pOutPixels);
    2179             : 
    2180    20790849 :             nDstOffset += 8 * nDstPixelStride;
    2181             :         }
    2182             :     }
    2183    13045747 :     for (; n < nWordCount; n++)
    2184             :     {
    2185     6674121 :         const Tin tValue =
    2186     6674121 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2187     6674121 :         Tout *const pOutPixel =
    2188     6674121 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2189             : 
    2190     6674121 :         GDALCopyWord(tValue, *pOutPixel);
    2191             : 
    2192     6676761 :         nDstOffset += nDstPixelStride;
    2193             :     }
    2194     6371626 : }
    2195             : 
    2196             : #if defined(__x86_64) || defined(_M_X64)
    2197             : 
    2198             : #include <emmintrin.h>
    2199             : 
    2200             : template <class Tout>
    2201       35614 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
    2202             :                               int nSrcPixelStride,
    2203             :                               Tout *const CPL_RESTRICT pDstData,
    2204             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2205             : {
    2206             :     static_assert(std::is_integral<Tout>::value &&
    2207             :                       sizeof(Tout) == sizeof(uint16_t),
    2208             :                   "Bad Tout");
    2209       35614 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2210             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2211             :     {
    2212       29163 :         decltype(nWordCount) n = 0;
    2213       29163 :         const __m128i xmm_zero = _mm_setzero_si128();
    2214       29163 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2215             :             reinterpret_cast<GByte *>(pDstData);
    2216      685905 :         for (; n < nWordCount - 15; n += 16)
    2217             :         {
    2218      656742 :             __m128i xmm = _mm_loadu_si128(
    2219      656742 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2220      656742 :             __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
    2221      656742 :             __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
    2222             :             _mm_storeu_si128(
    2223      656742 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
    2224             :             _mm_storeu_si128(
    2225      656742 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
    2226             :         }
    2227      100081 :         for (; n < nWordCount; n++)
    2228             :         {
    2229       70918 :             pDstData[n] = pSrcData[n];
    2230       29163 :         }
    2231             :     }
    2232             :     else
    2233             :     {
    2234        6451 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2235             :                               nDstPixelStride, nWordCount);
    2236             :     }
    2237       35614 : }
    2238             : 
    2239             : template <>
    2240       26001 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2241             :                     int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
    2242             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2243             : {
    2244       26001 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2245             :                              nDstPixelStride, nWordCount);
    2246       26001 : }
    2247             : 
    2248             : template <>
    2249        9613 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2250             :                     int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
    2251             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2252             : {
    2253        9613 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2254             :                              nDstPixelStride, nWordCount);
    2255        9613 : }
    2256             : 
    2257             : template <class Tout>
    2258    10235137 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
    2259             :                               int nSrcPixelStride,
    2260             :                               Tout *const CPL_RESTRICT pDstData,
    2261             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2262             : {
    2263             :     static_assert(std::is_integral<Tout>::value &&
    2264             :                       sizeof(Tout) == sizeof(uint32_t),
    2265             :                   "Bad Tout");
    2266    10235137 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2267             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2268             :     {
    2269     4807027 :         decltype(nWordCount) n = 0;
    2270     4807027 :         const __m128i xmm_zero = _mm_setzero_si128();
    2271     4807027 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2272             :             reinterpret_cast<GByte *>(pDstData);
    2273    54986451 :         for (; n < nWordCount - 15; n += 16)
    2274             :         {
    2275    50318224 :             __m128i xmm = _mm_loadu_si128(
    2276    50318224 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2277    50334824 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2278    50301824 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2279    50205624 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2280    50092424 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2281    50185424 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2282    50179424 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2283             :             _mm_storeu_si128(
    2284    50179424 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
    2285             :             _mm_storeu_si128(
    2286    50179424 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
    2287             :             _mm_storeu_si128(
    2288    50179424 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
    2289             :             _mm_storeu_si128(
    2290    50179424 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
    2291             :         }
    2292     8929282 :         for (; n < nWordCount; n++)
    2293             :         {
    2294     4261085 :             pDstData[n] = pSrcData[n];
    2295     4668197 :         }
    2296             :     }
    2297             :     else
    2298             :     {
    2299     5428080 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2300             :                               nDstPixelStride, nWordCount);
    2301             :     }
    2302    10096237 : }
    2303             : 
    2304             : template <>
    2305         437 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2306             :                     int nSrcPixelStride, GUInt32 *const CPL_RESTRICT pDstData,
    2307             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2308             : {
    2309         437 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2310             :                              nDstPixelStride, nWordCount);
    2311         437 : }
    2312             : 
    2313             : template <>
    2314    10234600 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2315             :                     int nSrcPixelStride, GInt32 *const CPL_RESTRICT pDstData,
    2316             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2317             : {
    2318    10234600 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2319             :                              nDstPixelStride, nWordCount);
    2320    10234700 : }
    2321             : 
    2322             : template <>
    2323     2466810 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2324             :                     int nSrcPixelStride, float *const CPL_RESTRICT pDstData,
    2325             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2326             : {
    2327     2466810 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2328             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2329             :     {
    2330      108968 :         decltype(nWordCount) n = 0;
    2331      108968 :         const __m128i xmm_zero = _mm_setzero_si128();
    2332      108968 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2333             :             reinterpret_cast<GByte *>(pDstData);
    2334     3244400 :         for (; n < nWordCount - 15; n += 16)
    2335             :         {
    2336     3135430 :             __m128i xmm = _mm_loadu_si128(
    2337     3135430 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2338     3135430 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2339     3135430 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2340     3135430 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2341     3135430 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2342     3135430 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2343     3135430 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2344     3135430 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2345     3135430 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2346     3135430 :             __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
    2347     3135430 :             __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
    2348     3135430 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2349             :                           xmm0_f);
    2350             :             _mm_storeu_ps(
    2351     3135430 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2352             :             _mm_storeu_ps(
    2353     3135430 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
    2354             :             _mm_storeu_ps(
    2355     3135430 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
    2356             :         }
    2357      469776 :         for (; n < nWordCount; n++)
    2358             :         {
    2359      360808 :             pDstData[n] = pSrcData[n];
    2360      108968 :         }
    2361             :     }
    2362             :     else
    2363             :     {
    2364     2357840 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2365             :                               nDstPixelStride, nWordCount);
    2366             :     }
    2367     2466810 : }
    2368             : 
    2369             : template <>
    2370      142991 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2371             :                     int nSrcPixelStride, double *const CPL_RESTRICT pDstData,
    2372             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2373             : {
    2374      142991 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2375             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2376             :     {
    2377      120161 :         decltype(nWordCount) n = 0;
    2378      120161 :         const __m128i xmm_zero = _mm_setzero_si128();
    2379      120161 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2380             :             reinterpret_cast<GByte *>(pDstData);
    2381     1392960 :         for (; n < nWordCount - 15; n += 16)
    2382             :         {
    2383     1272790 :             __m128i xmm = _mm_loadu_si128(
    2384     1272790 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2385     1272790 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2386     1272790 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2387     1272790 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2388     1272790 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2389     1272790 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2390     1272790 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2391             : 
    2392     1272790 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2393     1272790 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2394     1272790 :             __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
    2395     1272790 :             __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
    2396     1272790 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2397     1272790 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2398     1272790 :             xmm2 = _mm_srli_si128(xmm2, 8);
    2399     1272790 :             xmm3 = _mm_srli_si128(xmm3, 8);
    2400     1272790 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2401     1272790 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2402     1272790 :             __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
    2403     1272790 :             __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
    2404             : 
    2405     1272790 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2406             :                           xmm0_low_d);
    2407             :             _mm_storeu_pd(
    2408     1272790 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2409             :                 xmm0_high_d);
    2410             :             _mm_storeu_pd(
    2411     1272790 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2412             :                 xmm1_low_d);
    2413             :             _mm_storeu_pd(
    2414     1272790 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2415             :                 xmm1_high_d);
    2416             :             _mm_storeu_pd(
    2417     1272790 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2418             :                 xmm2_low_d);
    2419             :             _mm_storeu_pd(
    2420     1272790 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
    2421             :                 xmm2_high_d);
    2422             :             _mm_storeu_pd(
    2423     1272790 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2424             :                 xmm3_low_d);
    2425             :             _mm_storeu_pd(
    2426     1272790 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
    2427             :                 xmm3_high_d);
    2428             :         }
    2429      227242 :         for (; n < nWordCount; n++)
    2430             :         {
    2431      107081 :             pDstData[n] = pSrcData[n];
    2432      120161 :         }
    2433             :     }
    2434             :     else
    2435             :     {
    2436       22830 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2437             :                               nDstPixelStride, nWordCount);
    2438             :     }
    2439      142991 : }
    2440             : 
    2441             : template <>
    2442        6111 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2443             :                     int nSrcPixelStride, GByte *const CPL_RESTRICT pDstData,
    2444             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2445             : {
    2446        6111 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2447             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2448             :     {
    2449        4936 :         decltype(nWordCount) n = 0;
    2450             :         // In SSE2, min_epu16 does not exist, so shift from
    2451             :         // UInt16 to SInt16 to be able to use min_epi16
    2452        4936 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2453        4936 :         const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
    2454      138166 :         for (; n < nWordCount - 7; n += 8)
    2455             :         {
    2456      133230 :             __m128i xmm = _mm_loadu_si128(
    2457      133230 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2458      133230 :             xmm = _mm_add_epi16(xmm, xmm_UINT16_to_INT16);
    2459      133230 :             xmm = _mm_min_epi16(xmm, xmm_m255_shifted);
    2460      133230 :             xmm = _mm_sub_epi16(xmm, xmm_UINT16_to_INT16);
    2461      133230 :             xmm = _mm_packus_epi16(xmm, xmm);
    2462      133230 :             GDALCopyXMMToInt64(xmm,
    2463      133230 :                                reinterpret_cast<GPtrDiff_t *>(pDstData + n));
    2464             :         }
    2465       15506 :         for (; n < nWordCount; n++)
    2466             :         {
    2467       10570 :             pDstData[n] =
    2468       10570 :                 pSrcData[n] >= 255 ? 255 : static_cast<GByte>(pSrcData[n]);
    2469        4936 :         }
    2470             :     }
    2471             :     else
    2472             :     {
    2473        1175 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2474             :                               nDstPixelStride, nWordCount);
    2475             :     }
    2476        6111 : }
    2477             : 
    2478             : template <>
    2479         221 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2480             :                     int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
    2481             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2482             : {
    2483         221 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2484             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2485             :     {
    2486          15 :         decltype(nWordCount) n = 0;
    2487             :         // In SSE2, min_epu16 does not exist, so shift from
    2488             :         // UInt16 to SInt16 to be able to use min_epi16
    2489          15 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2490          15 :         const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
    2491          31 :         for (; n < nWordCount - 7; n += 8)
    2492             :         {
    2493          16 :             __m128i xmm = _mm_loadu_si128(
    2494          16 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2495          16 :             xmm = _mm_add_epi16(xmm, xmm_UINT16_to_INT16);
    2496          16 :             xmm = _mm_min_epi16(xmm, xmm_32767_shifted);
    2497          16 :             xmm = _mm_sub_epi16(xmm, xmm_UINT16_to_INT16);
    2498          16 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm);
    2499             :         }
    2500          55 :         for (; n < nWordCount; n++)
    2501             :         {
    2502          40 :             pDstData[n] =
    2503          40 :                 pSrcData[n] >= 32767 ? 32767 : static_cast<GInt16>(pSrcData[n]);
    2504          15 :         }
    2505             :     }
    2506             :     else
    2507             :     {
    2508         206 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2509             :                               nDstPixelStride, nWordCount);
    2510             :     }
    2511         221 : }
    2512             : 
    2513             : template <>
    2514         349 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2515             :                     int nSrcPixelStride, float *const CPL_RESTRICT pDstData,
    2516             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2517             : {
    2518         349 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2519             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2520             :     {
    2521         343 :         decltype(nWordCount) n = 0;
    2522         343 :         const __m128i xmm_zero = _mm_setzero_si128();
    2523         343 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2524             :             reinterpret_cast<GByte *>(pDstData);
    2525        1311 :         for (; n < nWordCount - 7; n += 8)
    2526             :         {
    2527         968 :             __m128i xmm = _mm_loadu_si128(
    2528         968 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2529         968 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2530         968 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2531         968 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2532         968 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2533         968 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2534             :                           xmm0_f);
    2535             :             _mm_storeu_ps(
    2536         968 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2537             :         }
    2538        1105 :         for (; n < nWordCount; n++)
    2539             :         {
    2540         762 :             pDstData[n] = pSrcData[n];
    2541         343 :         }
    2542             :     }
    2543             :     else
    2544             :     {
    2545           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2546             :                               nDstPixelStride, nWordCount);
    2547             :     }
    2548         349 : }
    2549             : 
    2550             : template <>
    2551         279 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2552             :                     int nSrcPixelStride, double *const CPL_RESTRICT pDstData,
    2553             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2554             : {
    2555         279 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2556             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2557             :     {
    2558         171 :         decltype(nWordCount) n = 0;
    2559         171 :         const __m128i xmm_zero = _mm_setzero_si128();
    2560         171 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2561             :             reinterpret_cast<GByte *>(pDstData);
    2562         219 :         for (; n < nWordCount - 7; n += 8)
    2563             :         {
    2564          48 :             __m128i xmm = _mm_loadu_si128(
    2565          48 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2566          48 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2567          48 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2568             : 
    2569          48 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2570          48 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2571          48 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2572          48 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2573          48 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2574          48 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2575             : 
    2576          48 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2577             :                           xmm0_low_d);
    2578             :             _mm_storeu_pd(
    2579          48 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2580             :                 xmm0_high_d);
    2581             :             _mm_storeu_pd(
    2582          48 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2583             :                 xmm1_low_d);
    2584             :             _mm_storeu_pd(
    2585          48 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2586             :                 xmm1_high_d);
    2587             :         }
    2588         429 :         for (; n < nWordCount; n++)
    2589             :         {
    2590         258 :             pDstData[n] = pSrcData[n];
    2591         171 :         }
    2592             :     }
    2593             :     else
    2594             :     {
    2595         108 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2596             :                               nDstPixelStride, nWordCount);
    2597             :     }
    2598         279 : }
    2599             : 
    2600             : template <>
    2601         946 : void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    2602             :                     int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
    2603             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2604             : {
    2605         946 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2606             :                             nDstPixelStride, nWordCount);
    2607         946 : }
    2608             : 
    2609             : #endif  // defined(__x86_64) || defined(_M_X64)
    2610             : 
    2611             : template <>
    2612     2266830 : void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    2613             :                     int nSrcPixelStride, GByte *const CPL_RESTRICT pDstData,
    2614             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2615             : {
    2616     2266830 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2617             :                             nDstPixelStride, nWordCount);
    2618     2266830 : }
    2619             : 
    2620             : template <>
    2621     2933510 : void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    2622             :                     int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
    2623             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2624             : {
    2625     2933510 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2626             :                             nDstPixelStride, nWordCount);
    2627     2933510 : }
    2628             : 
    2629             : template <>
    2630     1170340 : void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    2631             :                     int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
    2632             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2633             : {
    2634     1170340 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2635             :                             nDstPixelStride, nWordCount);
    2636     1170340 : }
    2637             : 
    2638             : /************************************************************************/
    2639             : /*                   GDALCopyWordsComplexT()                            */
    2640             : /************************************************************************/
    2641             : /**
    2642             :  * Template function, used to copy data from pSrcData into buffer
    2643             :  * pDstData, with stride nSrcPixelStride in the source data and
    2644             :  * stride nDstPixelStride in the destination data. Deals with the
    2645             :  * complex case, where input is complex and output is complex.
    2646             :  *
    2647             :  * @param pSrcData the source data buffer
    2648             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2649             :  *                      of interest.
    2650             :  * @param pDstData the destination buffer.
    2651             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2652             :  *                      interest.
    2653             :  * @param nWordCount the total number of pixel words to copy
    2654             :  *
    2655             :  */
    2656             : template <class Tin, class Tout>
    2657      125244 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
    2658             :                                   int nSrcPixelStride,
    2659             :                                   Tout *const CPL_RESTRICT pDstData,
    2660             :                                   int nDstPixelStride, GPtrDiff_t nWordCount)
    2661             : {
    2662      125244 :     decltype(nWordCount) nDstOffset = 0;
    2663      125244 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2664      125244 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2665             : 
    2666     7339801 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2667             :     {
    2668     7214552 :         const Tin *const pPixelIn =
    2669     7214552 :             reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    2670     7214552 :         Tout *const pPixelOut =
    2671     7214552 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2672             : 
    2673     7214552 :         GDALCopyWord(pPixelIn[0], pPixelOut[0]);
    2674     7214552 :         GDALCopyWord(pPixelIn[1], pPixelOut[1]);
    2675             : 
    2676     7214552 :         nDstOffset += nDstPixelStride;
    2677             :     }
    2678      125244 : }
    2679             : 
    2680             : /************************************************************************/
    2681             : /*                   GDALCopyWordsComplexOutT()                         */
    2682             : /************************************************************************/
    2683             : /**
    2684             :  * Template function, used to copy data from pSrcData into buffer
    2685             :  * pDstData, with stride nSrcPixelStride in the source data and
    2686             :  * stride nDstPixelStride in the destination data. Deals with the
    2687             :  * case where the value is real coming in, but complex going out.
    2688             :  *
    2689             :  * @param pSrcData the source data buffer
    2690             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2691             :  *                      of interest, in bytes.
    2692             :  * @param pDstData the destination buffer.
    2693             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2694             :  *                      interest, in bytes.
    2695             :  * @param nWordCount the total number of pixel words to copy
    2696             :  *
    2697             :  */
    2698             : template <class Tin, class Tout>
    2699        2640 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
    2700             :                                      int nSrcPixelStride,
    2701             :                                      Tout *const CPL_RESTRICT pDstData,
    2702             :                                      int nDstPixelStride, GPtrDiff_t nWordCount)
    2703             : {
    2704        2640 :     decltype(nWordCount) nDstOffset = 0;
    2705             : 
    2706        2640 :     const Tout tOutZero = static_cast<Tout>(0);
    2707             : 
    2708        2640 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2709        2640 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2710             : 
    2711     1079135 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2712             :     {
    2713     1076495 :         const Tin tValue =
    2714     1076495 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    2715     1076495 :         Tout *const pPixelOut =
    2716     1076495 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2717     1076495 :         GDALCopyWord(tValue, *pPixelOut);
    2718             : 
    2719     1076495 :         pPixelOut[1] = tOutZero;
    2720             : 
    2721     1076495 :         nDstOffset += nDstPixelStride;
    2722             :     }
    2723        2640 : }
    2724             : 
    2725             : /************************************************************************/
    2726             : /*                           GDALCopyWordsFromT()                       */
    2727             : /************************************************************************/
    2728             : /**
    2729             :  * Template driver function. Given the input type T, call the appropriate
    2730             :  * GDALCopyWordsT function template for the desired output type. You should
    2731             :  * never call this function directly (call GDALCopyWords instead).
    2732             :  *
    2733             :  * @param pSrcData source data buffer
    2734             :  * @param nSrcPixelStride pixel stride in input buffer, in pixel words
    2735             :  * @param bInComplex input is complex
    2736             :  * @param pDstData destination data buffer
    2737             :  * @param eDstType destination data type
    2738             :  * @param nDstPixelStride pixel stride in output buffer, in pixel words
    2739             :  * @param nWordCount number of pixel words to be copied
    2740             :  */
    2741             : template <class T>
    2742    57550011 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
    2743             :                                int nSrcPixelStride, bool bInComplex,
    2744             :                                void *CPL_RESTRICT pDstData,
    2745             :                                GDALDataType eDstType, int nDstPixelStride,
    2746             :                                GPtrDiff_t nWordCount)
    2747             : {
    2748    57550011 :     switch (eDstType)
    2749             :     {
    2750     6616272 :         case GDT_Byte:
    2751     6616272 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2752             :                            static_cast<unsigned char *>(pDstData),
    2753             :                            nDstPixelStride, nWordCount);
    2754     6616271 :             break;
    2755         439 :         case GDT_Int8:
    2756         439 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2757             :                            static_cast<signed char *>(pDstData),
    2758             :                            nDstPixelStride, nWordCount);
    2759         439 :             break;
    2760     1210410 :         case GDT_UInt16:
    2761     1210410 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2762             :                            static_cast<unsigned short *>(pDstData),
    2763             :                            nDstPixelStride, nWordCount);
    2764     1210410 :             break;
    2765     7041324 :         case GDT_Int16:
    2766     7041324 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2767             :                            static_cast<short *>(pDstData), nDstPixelStride,
    2768             :                            nWordCount);
    2769     7041324 :             break;
    2770        4262 :         case GDT_UInt32:
    2771        4262 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2772             :                            static_cast<unsigned int *>(pDstData),
    2773             :                            nDstPixelStride, nWordCount);
    2774        4262 :             break;
    2775    23458380 :         case GDT_Int32:
    2776    23458380 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2777             :                            static_cast<int *>(pDstData), nDstPixelStride,
    2778             :                            nWordCount);
    2779    23458480 :             break;
    2780         565 :         case GDT_UInt64:
    2781         565 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2782             :                            static_cast<std::uint64_t *>(pDstData),
    2783             :                            nDstPixelStride, nWordCount);
    2784         565 :             break;
    2785        4037 :         case GDT_Int64:
    2786        4037 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2787             :                            static_cast<std::int64_t *>(pDstData),
    2788             :                            nDstPixelStride, nWordCount);
    2789        4037 :             break;
    2790     3930331 :         case GDT_Float32:
    2791     3930331 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2792             :                            static_cast<float *>(pDstData), nDstPixelStride,
    2793             :                            nWordCount);
    2794     3930331 :             break;
    2795    15156079 :         case GDT_Float64:
    2796    15156079 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2797             :                            static_cast<double *>(pDstData), nDstPixelStride,
    2798             :                            nWordCount);
    2799    15156079 :             break;
    2800      122404 :         case GDT_CInt16:
    2801      122404 :             if (bInComplex)
    2802             :             {
    2803      121394 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2804             :                                       static_cast<short *>(pDstData),
    2805             :                                       nDstPixelStride, nWordCount);
    2806             :             }
    2807             :             else  // input is not complex, so we need to promote to a complex
    2808             :                   // buffer
    2809             :             {
    2810        1010 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2811             :                                          static_cast<short *>(pDstData),
    2812             :                                          nDstPixelStride, nWordCount);
    2813             :             }
    2814      122404 :             break;
    2815         783 :         case GDT_CInt32:
    2816         783 :             if (bInComplex)
    2817             :             {
    2818         395 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2819             :                                       static_cast<int *>(pDstData),
    2820             :                                       nDstPixelStride, nWordCount);
    2821             :             }
    2822             :             else  // input is not complex, so we need to promote to a complex
    2823             :                   // buffer
    2824             :             {
    2825         388 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2826             :                                          static_cast<int *>(pDstData),
    2827             :                                          nDstPixelStride, nWordCount);
    2828             :             }
    2829         783 :             break;
    2830        3268 :         case GDT_CFloat32:
    2831        3268 :             if (bInComplex)
    2832             :             {
    2833        2693 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2834             :                                       static_cast<float *>(pDstData),
    2835             :                                       nDstPixelStride, nWordCount);
    2836             :             }
    2837             :             else  // input is not complex, so we need to promote to a complex
    2838             :                   // buffer
    2839             :             {
    2840         575 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2841             :                                          static_cast<float *>(pDstData),
    2842             :                                          nDstPixelStride, nWordCount);
    2843             :             }
    2844        3268 :             break;
    2845        1429 :         case GDT_CFloat64:
    2846        1429 :             if (bInComplex)
    2847             :             {
    2848         762 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2849             :                                       static_cast<double *>(pDstData),
    2850             :                                       nDstPixelStride, nWordCount);
    2851             :             }
    2852             :             else  // input is not complex, so we need to promote to a complex
    2853             :                   // buffer
    2854             :             {
    2855         667 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2856             :                                          static_cast<double *>(pDstData),
    2857             :                                          nDstPixelStride, nWordCount);
    2858             :             }
    2859        1429 :             break;
    2860           0 :         case GDT_Unknown:
    2861             :         case GDT_TypeCount:
    2862           0 :             CPLAssert(false);
    2863             :     }
    2864    57550111 : }
    2865             : 
    2866             : }  // end anonymous namespace
    2867             : 
    2868             : /************************************************************************/
    2869             : /*                          GDALReplicateWord()                         */
    2870             : /************************************************************************/
    2871             : 
    2872             : template <class T>
    2873      228882 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
    2874             :                                GPtrDiff_t nWordCount)
    2875             : {
    2876      228882 :     const T valSet = *static_cast<const T *>(pDstData);
    2877      228882 :     if (nDstPixelStride == static_cast<int>(sizeof(T)))
    2878             :     {
    2879      200511 :         T *pDstPtr = static_cast<T *>(pDstData) + 1;
    2880     9103956 :         while (nWordCount >= 4)
    2881             :         {
    2882     8903448 :             nWordCount -= 4;
    2883     8903448 :             pDstPtr[0] = valSet;
    2884     8903448 :             pDstPtr[1] = valSet;
    2885     8903448 :             pDstPtr[2] = valSet;
    2886     8903448 :             pDstPtr[3] = valSet;
    2887     8903448 :             pDstPtr += 4;
    2888             :         }
    2889      667980 :         while (nWordCount > 0)
    2890             :         {
    2891      467469 :             --nWordCount;
    2892      467469 :             *pDstPtr = valSet;
    2893      467469 :             pDstPtr++;
    2894             :         }
    2895             :     }
    2896             :     else
    2897             :     {
    2898       28371 :         GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
    2899      954250 :         while (nWordCount > 0)
    2900             :         {
    2901      925879 :             --nWordCount;
    2902      925879 :             *reinterpret_cast<T *>(pabyDstPtr) = valSet;
    2903      925879 :             pabyDstPtr += nDstPixelStride;
    2904             :         }
    2905             :     }
    2906      228882 : }
    2907             : 
    2908      590835 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
    2909             :                               GDALDataType eSrcType,
    2910             :                               void *CPL_RESTRICT pDstData,
    2911             :                               GDALDataType eDstType, int nDstPixelStride,
    2912             :                               GPtrDiff_t nWordCount)
    2913             : {
    2914             :     /* -----------------------------------------------------------------------
    2915             :      */
    2916             :     /* Special case when the source data is always the same value */
    2917             :     /* (for VRTSourcedRasterBand::IRasterIO and
    2918             :      * VRTDerivedRasterBand::IRasterIO*/
    2919             :     /*  for example) */
    2920             :     /* -----------------------------------------------------------------------
    2921             :      */
    2922             :     // Let the general translation case do the necessary conversions
    2923             :     // on the first destination element.
    2924      590835 :     GDALCopyWords(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
    2925             : 
    2926             :     // Now copy the first element to the nWordCount - 1 following destination
    2927             :     // elements.
    2928      590835 :     nWordCount--;
    2929      590835 :     GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
    2930             : 
    2931      590835 :     switch (eDstType)
    2932             :     {
    2933      361869 :         case GDT_Byte:
    2934             :         case GDT_Int8:
    2935             :         {
    2936      361869 :             if (nDstPixelStride == 1)
    2937             :             {
    2938      326917 :                 if (nWordCount > 0)
    2939      326917 :                     memset(pabyDstWord,
    2940      326917 :                            *reinterpret_cast<const GByte *>(pDstData),
    2941             :                            nWordCount);
    2942             :             }
    2943             :             else
    2944             :             {
    2945       34952 :                 GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
    2946     5558530 :                 while (nWordCount > 0)
    2947             :                 {
    2948     5523580 :                     --nWordCount;
    2949     5523580 :                     *pabyDstWord = valSet;
    2950     5523580 :                     pabyDstWord += nDstPixelStride;
    2951             :                 }
    2952             :             }
    2953      361869 :             break;
    2954             :         }
    2955             : 
    2956             : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type)                               \
    2957             :     case enum_type:                                                            \
    2958             :     {                                                                          \
    2959             :         GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount);     \
    2960             :         break;                                                                 \
    2961             :     }
    2962             : 
    2963         499 :             CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
    2964      170100 :             CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
    2965          90 :             CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
    2966         174 :             CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
    2967          15 :             CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
    2968          10 :             CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
    2969       52933 :             CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
    2970        5061 :             CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
    2971             : 
    2972             : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type)                              \
    2973             :     case enum_type:                                                            \
    2974             :     {                                                                          \
    2975             :         c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0];        \
    2976             :         c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1];        \
    2977             :         while (nWordCount > 0)                                                 \
    2978             :         {                                                                      \
    2979             :             --nWordCount;                                                      \
    2980             :             reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1;              \
    2981             :             reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2;              \
    2982             :             pabyDstWord += nDstPixelStride;                                    \
    2983             :         }                                                                      \
    2984             :         break;                                                                 \
    2985             :     }
    2986             : 
    2987         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
    2988         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
    2989         784 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
    2990         784 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
    2991             : 
    2992           0 :         case GDT_Unknown:
    2993             :         case GDT_TypeCount:
    2994           0 :             CPLAssert(false);
    2995             :     }
    2996      590835 : }
    2997             : 
    2998             : /************************************************************************/
    2999             : /*                        GDALUnrolledCopy()                            */
    3000             : /************************************************************************/
    3001             : 
    3002             : template <class T, int srcStride, int dstStride>
    3003     2871108 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
    3004             :                                            const T *CPL_RESTRICT pSrc,
    3005             :                                            GPtrDiff_t nIters)
    3006             : {
    3007     2871108 :     if (nIters >= 16)
    3008             :     {
    3009   129237696 :         for (GPtrDiff_t i = nIters / 16; i != 0; i--)
    3010             :         {
    3011   126496244 :             pDest[0 * dstStride] = pSrc[0 * srcStride];
    3012   126496244 :             pDest[1 * dstStride] = pSrc[1 * srcStride];
    3013   126496244 :             pDest[2 * dstStride] = pSrc[2 * srcStride];
    3014   126496244 :             pDest[3 * dstStride] = pSrc[3 * srcStride];
    3015   126496244 :             pDest[4 * dstStride] = pSrc[4 * srcStride];
    3016   126496244 :             pDest[5 * dstStride] = pSrc[5 * srcStride];
    3017   126496244 :             pDest[6 * dstStride] = pSrc[6 * srcStride];
    3018   126496244 :             pDest[7 * dstStride] = pSrc[7 * srcStride];
    3019   126496244 :             pDest[8 * dstStride] = pSrc[8 * srcStride];
    3020   126496244 :             pDest[9 * dstStride] = pSrc[9 * srcStride];
    3021   126496244 :             pDest[10 * dstStride] = pSrc[10 * srcStride];
    3022   126496244 :             pDest[11 * dstStride] = pSrc[11 * srcStride];
    3023   126496244 :             pDest[12 * dstStride] = pSrc[12 * srcStride];
    3024   126496244 :             pDest[13 * dstStride] = pSrc[13 * srcStride];
    3025   126496244 :             pDest[14 * dstStride] = pSrc[14 * srcStride];
    3026   126496244 :             pDest[15 * dstStride] = pSrc[15 * srcStride];
    3027   126496244 :             pDest += 16 * dstStride;
    3028   126496244 :             pSrc += 16 * srcStride;
    3029             :         }
    3030     2741412 :         nIters = nIters % 16;
    3031             :     }
    3032     5164971 :     for (GPtrDiff_t i = 0; i < nIters; i++)
    3033             :     {
    3034     2293863 :         pDest[i * dstStride] = *pSrc;
    3035     2293863 :         pSrc += srcStride;
    3036             :     }
    3037     2871108 : }
    3038             : 
    3039             : template <class T, int srcStride, int dstStride>
    3040     2865911 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
    3041             :                                     const T *CPL_RESTRICT pSrc,
    3042             :                                     GPtrDiff_t nIters)
    3043             : {
    3044     2865911 :     GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
    3045     2865920 : }
    3046             : 
    3047             : #if (defined(__x86_64) || defined(_M_X64))
    3048             : 
    3049             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    3050             : 
    3051             : #include "rasterio_ssse3.h"
    3052             : 
    3053             : #endif
    3054             : 
    3055             : template <>
    3056      306527 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
    3057             :                                    const GByte *CPL_RESTRICT pSrc,
    3058             :                                    GPtrDiff_t nIters)
    3059             : {
    3060      306527 :     decltype(nIters) i = 0;
    3061      306527 :     if (nIters > 16)
    3062             :     {
    3063      148367 :         const __m128i xmm_mask = _mm_set1_epi16(0xff);
    3064             :         // If we were sure that there would always be 1 trailing byte, we could
    3065             :         // check against nIters - 15
    3066     2584570 :         for (; i < nIters - 16; i += 16)
    3067             :         {
    3068             :             __m128i xmm0 =
    3069     2436210 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3070             :             __m128i xmm1 =
    3071     4872410 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3072             :             // Set higher 8bit of each int16 packed word to 0
    3073     2436210 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3074     2436210 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3075             :             // Pack int16 to uint8 and merge back both vector
    3076     2436210 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    3077             : 
    3078             :             // Store result
    3079     2436210 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3080             : 
    3081     2436210 :             pSrc += 2 * 16;
    3082             :         }
    3083             :     }
    3084     3918520 :     for (; i < nIters; i++)
    3085             :     {
    3086     3612000 :         pDest[i] = *pSrc;
    3087     3612000 :         pSrc += 2;
    3088             :     }
    3089      306527 : }
    3090             : 
    3091             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    3092             : 
    3093             : template <>
    3094      184387 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3095             :                                    const GByte *CPL_RESTRICT pSrc,
    3096             :                                    GPtrDiff_t nIters)
    3097             : {
    3098      184387 :     if (nIters > 16 && CPLHaveRuntimeSSSE3())
    3099             :     {
    3100      179190 :         GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    3101             :     }
    3102             :     else
    3103             :     {
    3104        5197 :         GDALUnrolledCopyGeneric<GByte, 3, 1>(pDest, pSrc, nIters);
    3105             :     }
    3106      184387 : }
    3107             : 
    3108             : #endif
    3109             : 
    3110             : template <>
    3111       96503 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
    3112             :                                    const GByte *CPL_RESTRICT pSrc,
    3113             :                                    GPtrDiff_t nIters)
    3114             : {
    3115       96503 :     decltype(nIters) i = 0;
    3116       96503 :     if (nIters > 16)
    3117             :     {
    3118       91210 :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    3119             :         // If we were sure that there would always be 3 trailing bytes, we could
    3120             :         // check against nIters - 15
    3121     3754210 :         for (; i < nIters - 16; i += 16)
    3122             :         {
    3123             :             __m128i xmm0 =
    3124     3663000 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3125             :             __m128i xmm1 =
    3126     3663000 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3127             :             __m128i xmm2 =
    3128     3663000 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    3129             :             __m128i xmm3 =
    3130     7326000 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
    3131             :             // Set higher 24bit of each int32 packed word to 0
    3132     3663000 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3133     3663000 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3134     3663000 :             xmm2 = _mm_and_si128(xmm2, xmm_mask);
    3135     3663000 :             xmm3 = _mm_and_si128(xmm3, xmm_mask);
    3136             :             // Pack int32 to int16
    3137     3663000 :             xmm0 = _mm_packs_epi32(xmm0, xmm1);
    3138     3663000 :             xmm2 = _mm_packs_epi32(xmm2, xmm3);
    3139             :             // Pack int16 to uint8
    3140     3663000 :             xmm0 = _mm_packus_epi16(xmm0, xmm2);
    3141             : 
    3142             :             // Store result
    3143     3663000 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3144             : 
    3145     3663000 :             pSrc += 4 * 16;
    3146             :         }
    3147             :     }
    3148     1054780 :     for (; i < nIters; i++)
    3149             :     {
    3150      958275 :         pDest[i] = *pSrc;
    3151      958275 :         pSrc += 4;
    3152             :     }
    3153       96503 : }
    3154             : #endif  // defined(__x86_64) || defined(_M_X64)
    3155             : 
    3156             : /************************************************************************/
    3157             : /*                         GDALFastCopy()                               */
    3158             : /************************************************************************/
    3159             : 
    3160             : template <class T>
    3161    33109600 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
    3162             :                                 const T *CPL_RESTRICT pSrc, int nSrcStride,
    3163             :                                 GPtrDiff_t nIters)
    3164             : {
    3165    33109600 :     constexpr int sizeofT = static_cast<int>(sizeof(T));
    3166    33109600 :     if (nIters == 1)
    3167             :     {
    3168    20744110 :         *pDest = *pSrc;
    3169             :     }
    3170    12365387 :     else if (nDestStride == sizeofT)
    3171             :     {
    3172     9467942 :         if (nSrcStride == sizeofT)
    3173             :         {
    3174     8822625 :             memcpy(pDest, pSrc, nIters * sizeof(T));
    3175             :         }
    3176      645317 :         else if (nSrcStride == 2 * sizeofT)
    3177             :         {
    3178      309468 :             GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
    3179             :         }
    3180      335849 :         else if (nSrcStride == 3 * sizeofT)
    3181             :         {
    3182      197087 :             GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
    3183             :         }
    3184      138762 :         else if (nSrcStride == 4 * sizeofT)
    3185             :         {
    3186      125371 :             GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
    3187             :         }
    3188             :         else
    3189             :         {
    3190    13005720 :             while (nIters-- > 0)
    3191             :             {
    3192    12992330 :                 *pDest = *pSrc;
    3193    12992330 :                 pSrc += nSrcStride / sizeofT;
    3194    12992330 :                 pDest++;
    3195             :             }
    3196             :         }
    3197             :     }
    3198     2897495 :     else if (nSrcStride == sizeofT)
    3199             :     {
    3200     2880619 :         if (nDestStride == 2 * sizeofT)
    3201             :         {
    3202      128599 :             GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
    3203             :         }
    3204     2752020 :         else if (nDestStride == 3 * sizeofT)
    3205             :         {
    3206     2062171 :             GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
    3207             :         }
    3208      689853 :         else if (nDestStride == 4 * sizeofT)
    3209             :         {
    3210      630632 :             GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
    3211             :         }
    3212             :         else
    3213             :         {
    3214    12625700 :             while (nIters-- > 0)
    3215             :             {
    3216    12566480 :                 *pDest = *pSrc;
    3217    12566480 :                 pSrc++;
    3218    12566480 :                 pDest += nDestStride / sizeofT;
    3219             :             }
    3220             :         }
    3221             :     }
    3222             :     else
    3223             :     {
    3224     1115759 :         while (nIters-- > 0)
    3225             :         {
    3226     1098888 :             *pDest = *pSrc;
    3227     1098888 :             pSrc += nSrcStride / sizeofT;
    3228     1098888 :             pDest += nDestStride / sizeofT;
    3229             :         }
    3230             :     }
    3231    33109600 : }
    3232             : 
    3233             : /************************************************************************/
    3234             : /*                         GDALFastCopyByte()                           */
    3235             : /************************************************************************/
    3236             : 
    3237      276277 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
    3238             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
    3239             :                              int nDstPixelStride, GPtrDiff_t nWordCount)
    3240             : {
    3241      276277 :     GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
    3242             :                  nWordCount);
    3243      276277 : }
    3244             : 
    3245             : /************************************************************************/
    3246             : /*                           GDALCopyWords()                            */
    3247             : /************************************************************************/
    3248             : 
    3249             : /**
    3250             :  * Copy pixel words from buffer to buffer.
    3251             :  *
    3252             :  * @see GDALCopyWords64()
    3253             :  */
    3254   101000000 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
    3255             :                                GDALDataType eSrcType, int nSrcPixelStride,
    3256             :                                void *CPL_RESTRICT pDstData,
    3257             :                                GDALDataType eDstType, int nDstPixelStride,
    3258             :                                int nWordCount)
    3259             : {
    3260   101000000 :     GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
    3261             :                     nDstPixelStride, nWordCount);
    3262   100999000 : }
    3263             : 
    3264             : /************************************************************************/
    3265             : /*                          GDALCopyWords64()                           */
    3266             : /************************************************************************/
    3267             : 
    3268             : /**
    3269             :  * Copy pixel words from buffer to buffer.
    3270             :  *
    3271             :  * This function is used to copy pixel word values from one memory buffer
    3272             :  * to another, with support for conversion between data types, and differing
    3273             :  * step factors.  The data type conversion is done using the normal GDAL
    3274             :  * rules.  Values assigned to a lower range integer type are clipped.  For
    3275             :  * instance assigning GDT_Int16 values to a GDT_Byte buffer will cause values
    3276             :  * less the 0 to be set to 0, and values larger than 255 to be set to 255.
    3277             :  * Assignment from floating point to integer uses default C type casting
    3278             :  * semantics.   Assignment from non-complex to complex will result in the
    3279             :  * imaginary part being set to zero on output.  Assignment from complex to
    3280             :  * non-complex will result in the complex portion being lost and the real
    3281             :  * component being preserved (<i>not magnitude!</i>).
    3282             :  *
    3283             :  * No assumptions are made about the source or destination words occurring
    3284             :  * on word boundaries.  It is assumed that all values are in native machine
    3285             :  * byte order.
    3286             :  *
    3287             :  * @param pSrcData Pointer to source data to be converted.
    3288             :  * @param eSrcType the source data type (see GDALDataType enum)
    3289             :  * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
    3290             :  * in bytes
    3291             :  * @param pDstData Pointer to buffer where destination data should go
    3292             :  * @param eDstType the destination data type (see GDALDataType enum)
    3293             :  * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
    3294             :  * words), in bytes
    3295             :  * @param nWordCount number of words to be copied
    3296             :  *
    3297             :  * @note
    3298             :  * When adding a new data type to GDAL, you must do the following to
    3299             :  * support it properly within the GDALCopyWords function:
    3300             :  * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
    3301             :  *    This should invoke the appropriate GDALCopyWordsFromT wrapper.
    3302             :  * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
    3303             :  *    This should call the appropriate GDALCopyWordsT template.
    3304             :  * 3. If appropriate, overload the appropriate CopyWord template in the
    3305             :  *    above namespace. This will ensure that any conversion issues are
    3306             :  *    handled (cases like the float -> int32 case, where the min/max)
    3307             :  *    values are subject to roundoff error.
    3308             :  */
    3309             : 
    3310   105270000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
    3311             :                                  GDALDataType eSrcType, int nSrcPixelStride,
    3312             :                                  void *CPL_RESTRICT pDstData,
    3313             :                                  GDALDataType eDstType, int nDstPixelStride,
    3314             :                                  GPtrDiff_t nWordCount)
    3315             : 
    3316             : {
    3317             :     // On platforms where alignment matters, be careful
    3318   105270000 :     const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
    3319   105269000 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
    3320   105269000 :     assert(nSrcDataTypeSize != 0);
    3321   105269000 :     assert(nDstDataTypeSize != 0);
    3322   105269000 :     if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
    3323    61458800 :         ((reinterpret_cast<GPtrDiff_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
    3324    61459200 :          (reinterpret_cast<GPtrDiff_t>(pDstData) % nDstDataTypeSize) != 0 ||
    3325    61458900 :          (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
    3326    61458900 :          (nDstPixelStride % nDstDataTypeSize) != 0))
    3327             :     {
    3328         605 :         if (eSrcType == eDstType)
    3329             :         {
    3330       19500 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    3331             :             {
    3332       19000 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    3333             :                        static_cast<const GByte *>(pSrcData) +
    3334       19000 :                            nSrcPixelStride * i,
    3335             :                        nDstDataTypeSize);
    3336             :             }
    3337             :         }
    3338             :         else
    3339             :         {
    3340             : #define ALIGN_PTR(ptr, align)                                                  \
    3341             :     ((ptr) + ((align) - (reinterpret_cast<size_t>(ptr) % (align))) % (align))
    3342             :             // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
    3343             :             // be sure to get correctly aligned pointer.
    3344             :             GByte abySrcBuffer[32];
    3345             :             GByte abyDstBuffer[32];
    3346         105 :             GByte *pabySrcBuffer = ALIGN_PTR(abySrcBuffer, nSrcDataTypeSize);
    3347         105 :             GByte *pabyDstBuffer = ALIGN_PTR(abyDstBuffer, nDstDataTypeSize);
    3348        3360 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    3349             :             {
    3350        3255 :                 memcpy(pabySrcBuffer,
    3351             :                        static_cast<const GByte *>(pSrcData) +
    3352        3255 :                            nSrcPixelStride * i,
    3353             :                        nSrcDataTypeSize);
    3354        3255 :                 GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
    3355             :                                 eDstType, 0, 1);
    3356        3255 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    3357             :                        pabyDstBuffer, nDstDataTypeSize);
    3358             :             }
    3359             :         }
    3360         605 :         return;
    3361             :     }
    3362             : 
    3363             :     // Deal with the case where we're replicating a single word into the
    3364             :     // provided buffer
    3365   105269000 :     if (nSrcPixelStride == 0 && nWordCount > 1)
    3366             :     {
    3367      590835 :         GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
    3368             :                           nDstPixelStride, nWordCount);
    3369      590835 :         return;
    3370             :     }
    3371             : 
    3372   104678000 :     if (eSrcType == eDstType)
    3373             :     {
    3374    47282000 :         if (eSrcType == GDT_Byte || eSrcType == GDT_Int8)
    3375             :         {
    3376    13506700 :             GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
    3377             :                          static_cast<const GByte *>(pSrcData), nSrcPixelStride,
    3378             :                          nWordCount);
    3379    13506200 :             return;
    3380             :         }
    3381             : 
    3382    33775300 :         if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
    3383    19326500 :             (nDstPixelStride % 2) == 0)
    3384             :         {
    3385    19326500 :             GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
    3386             :                          static_cast<const short *>(pSrcData), nSrcPixelStride,
    3387             :                          nWordCount);
    3388    19325700 :             return;
    3389             :         }
    3390             : 
    3391    14448800 :         if (nWordCount == 1)
    3392             :         {
    3393             : #if defined(CSA_BUILD) || defined(__COVERITY__)
    3394             :             // Avoid false positives...
    3395             :             memcpy(pDstData, pSrcData, nSrcDataTypeSize);
    3396             : #else
    3397    14040200 :             if (nSrcDataTypeSize == 2)
    3398           0 :                 memcpy(pDstData, pSrcData, 2);
    3399    14040200 :             else if (nSrcDataTypeSize == 4)
    3400    14015600 :                 memcpy(pDstData, pSrcData, 4);
    3401       24641 :             else if (nSrcDataTypeSize == 8)
    3402       24519 :                 memcpy(pDstData, pSrcData, 8);
    3403             :             else /* if( eSrcType == GDT_CFloat64 ) */
    3404         122 :                 memcpy(pDstData, pSrcData, 16);
    3405             : #endif
    3406    14040200 :             return;
    3407             :         }
    3408             : 
    3409             :         // Let memcpy() handle the case where we're copying a packed buffer
    3410             :         // of pixels.
    3411      408616 :         if (nSrcPixelStride == nDstPixelStride)
    3412             :         {
    3413      257146 :             if (nSrcPixelStride == nSrcDataTypeSize)
    3414             :             {
    3415      254976 :                 memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
    3416      254976 :                 return;
    3417             :             }
    3418             :         }
    3419             :     }
    3420             : 
    3421             :     // Handle the more general case -- deals with conversion of data types
    3422             :     // directly.
    3423    57549500 :     switch (eSrcType)
    3424             :     {
    3425    12883100 :         case GDT_Byte:
    3426    12883100 :             GDALCopyWordsFromT<unsigned char>(
    3427             :                 static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
    3428             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3429    12883100 :             break;
    3430         852 :         case GDT_Int8:
    3431         852 :             GDALCopyWordsFromT<signed char>(
    3432             :                 static_cast<const signed char *>(pSrcData), nSrcPixelStride,
    3433             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3434         852 :             break;
    3435       49726 :         case GDT_UInt16:
    3436       49726 :             GDALCopyWordsFromT<unsigned short>(
    3437             :                 static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
    3438             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3439       49726 :             break;
    3440     4520700 :         case GDT_Int16:
    3441     4520700 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    3442             :                                       nSrcPixelStride, false, pDstData,
    3443             :                                       eDstType, nDstPixelStride, nWordCount);
    3444     4520700 :             break;
    3445        5733 :         case GDT_UInt32:
    3446        5733 :             GDALCopyWordsFromT<unsigned int>(
    3447             :                 static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
    3448             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3449        5733 :             break;
    3450    12254600 :         case GDT_Int32:
    3451    12254600 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    3452             :                                     nSrcPixelStride, false, pDstData, eDstType,
    3453             :                                     nDstPixelStride, nWordCount);
    3454    12254600 :             break;
    3455         482 :         case GDT_UInt64:
    3456         482 :             GDALCopyWordsFromT<std::uint64_t>(
    3457             :                 static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
    3458             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3459         482 :             break;
    3460        6908 :         case GDT_Int64:
    3461        6908 :             GDALCopyWordsFromT<std::int64_t>(
    3462             :                 static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
    3463             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3464        6908 :             break;
    3465     6496430 :         case GDT_Float32:
    3466     6496430 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    3467             :                                       nSrcPixelStride, false, pDstData,
    3468             :                                       eDstType, nDstPixelStride, nWordCount);
    3469     6496440 :             break;
    3470    20590300 :         case GDT_Float64:
    3471    20590300 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    3472             :                                        nSrcPixelStride, false, pDstData,
    3473             :                                        eDstType, nDstPixelStride, nWordCount);
    3474    20590300 :             break;
    3475      566959 :         case GDT_CInt16:
    3476      566959 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    3477             :                                       nSrcPixelStride, true, pDstData, eDstType,
    3478             :                                       nDstPixelStride, nWordCount);
    3479      566959 :             break;
    3480         405 :         case GDT_CInt32:
    3481         405 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    3482             :                                     nSrcPixelStride, true, pDstData, eDstType,
    3483             :                                     nDstPixelStride, nWordCount);
    3484         405 :             break;
    3485        1317 :         case GDT_CFloat32:
    3486        1317 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    3487             :                                       nSrcPixelStride, true, pDstData, eDstType,
    3488             :                                       nDstPixelStride, nWordCount);
    3489        1317 :             break;
    3490      172481 :         case GDT_CFloat64:
    3491      172481 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    3492             :                                        nSrcPixelStride, true, pDstData,
    3493             :                                        eDstType, nDstPixelStride, nWordCount);
    3494      172481 :             break;
    3495           0 :         case GDT_Unknown:
    3496             :         case GDT_TypeCount:
    3497           0 :             CPLAssert(false);
    3498             :     }
    3499             : }
    3500             : 
    3501             : /************************************************************************/
    3502             : /*                            GDALCopyBits()                            */
    3503             : /************************************************************************/
    3504             : 
    3505             : /**
    3506             :  * Bitwise word copying.
    3507             :  *
    3508             :  * A function for moving sets of partial bytes around.  Loosely
    3509             :  * speaking this is a bitwise analog to GDALCopyWords().
    3510             :  *
    3511             :  * It copies nStepCount "words" where each word is nBitCount bits long.
    3512             :  * The nSrcStep and nDstStep are the number of bits from the start of one
    3513             :  * word to the next (same as nBitCount if they are packed).  The nSrcOffset
    3514             :  * and nDstOffset are the offset into the source and destination buffers
    3515             :  * to start at, also measured in bits.
    3516             :  *
    3517             :  * All bit offsets are assumed to start from the high order bit in a byte
    3518             :  * (i.e. most significant bit first).  Currently this function is not very
    3519             :  * optimized, but it may be improved for some common cases in the future
    3520             :  * as needed.
    3521             :  *
    3522             :  * @param pabySrcData the source data buffer.
    3523             :  * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
    3524             :  * first word to copy.
    3525             :  * @param nSrcStep the offset in bits from the start one source word to the
    3526             :  * start of the next.
    3527             :  * @param pabyDstData the destination data buffer.
    3528             :  * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
    3529             :  * first word to copy over.
    3530             :  * @param nDstStep the offset in bits from the start one word to the
    3531             :  * start of the next.
    3532             :  * @param nBitCount the number of bits in a word to be copied.
    3533             :  * @param nStepCount the number of words to copy.
    3534             :  */
    3535             : 
    3536           0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
    3537             :                   GByte *pabyDstData, int nDstOffset, int nDstStep,
    3538             :                   int nBitCount, int nStepCount)
    3539             : 
    3540             : {
    3541           0 :     VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");
    3542             : 
    3543           0 :     for (int iStep = 0; iStep < nStepCount; iStep++)
    3544             :     {
    3545           0 :         for (int iBit = 0; iBit < nBitCount; iBit++)
    3546             :         {
    3547           0 :             if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))
    3548           0 :                 pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));
    3549             :             else
    3550           0 :                 pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));
    3551             : 
    3552           0 :             nSrcOffset++;
    3553           0 :             nDstOffset++;
    3554             :         }
    3555             : 
    3556           0 :         nSrcOffset += (nSrcStep - nBitCount);
    3557           0 :         nDstOffset += (nDstStep - nBitCount);
    3558             :     }
    3559             : }
    3560             : 
    3561             : /************************************************************************/
    3562             : /*                    GDALGetBestOverviewLevel()                        */
    3563             : /*                                                                      */
    3564             : /* Returns the best overview level to satisfy the query or -1 if none   */
    3565             : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when        */
    3566             : /* returning a valid overview level                                     */
    3567             : /************************************************************************/
    3568             : 
    3569           0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
    3570             :                                  int &nXSize, int &nYSize, int nBufXSize,
    3571             :                                  int nBufYSize)
    3572             : {
    3573           0 :     return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,
    3574           0 :                                          nBufXSize, nBufYSize, nullptr);
    3575             : }
    3576             : 
    3577      323324 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
    3578             :                                   int &nYOff, int &nXSize, int &nYSize,
    3579             :                                   int nBufXSize, int nBufYSize,
    3580             :                                   GDALRasterIOExtraArg *psExtraArg)
    3581             : {
    3582             :     /* -------------------------------------------------------------------- */
    3583             :     /*      Compute the desired downsampling factor.  It is                 */
    3584             :     /*      based on the least reduced axis, and represents the number      */
    3585             :     /*      of source pixels to one destination pixel.                      */
    3586             :     /* -------------------------------------------------------------------- */
    3587      323324 :     const double dfDesiredDownsamplingFactor =
    3588      323324 :         ((nXSize / static_cast<double>(nBufXSize)) <
    3589      160986 :              (nYSize / static_cast<double>(nBufYSize)) ||
    3590             :          nBufYSize == 1)
    3591      355200 :             ? nXSize / static_cast<double>(nBufXSize)
    3592      129110 :             : nYSize / static_cast<double>(nBufYSize);
    3593             : 
    3594             :     /* -------------------------------------------------------------------- */
    3595             :     /*      Find the overview level that largest downsampling factor (most  */
    3596             :     /*      downsampled) that is still less than (or only a little more)    */
    3597             :     /*      downsampled than the request.                                   */
    3598             :     /* -------------------------------------------------------------------- */
    3599      323324 :     const int nOverviewCount = poBand->GetOverviewCount();
    3600      323324 :     GDALRasterBand *poBestOverview = nullptr;
    3601      323324 :     double dfBestDownsamplingFactor = 0;
    3602      323324 :     int nBestOverviewLevel = -1;
    3603             : 
    3604             :     const char *pszOversampligThreshold =
    3605      323324 :         CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
    3606             : 
    3607             :     // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
    3608             :     // Do not exactly use a oversampling threshold of 1.0 because of numerical
    3609             :     // instability.
    3610      323324 :     const auto AdjustThreshold = [](double x)
    3611             :     {
    3612      323324 :         constexpr double EPS = 1e-2;
    3613      323324 :         return x == 1.0 ? x + EPS : x;
    3614             :     };
    3615      646648 :     const double dfOversamplingThreshold = AdjustThreshold(
    3616           9 :         pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)
    3617      323315 :         : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
    3618      646630 :             ? 1.0
    3619             :             : 1.2);
    3620      342234 :     for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    3621             :     {
    3622       18910 :         GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
    3623       37820 :         if (poOverview == nullptr ||
    3624       37818 :             poOverview->GetXSize() > poBand->GetXSize() ||
    3625       18908 :             poOverview->GetYSize() > poBand->GetYSize())
    3626             :         {
    3627           2 :             continue;
    3628             :         }
    3629             : 
    3630             :         // Compute downsampling factor of this overview
    3631             :         const double dfDownsamplingFactor = std::min(
    3632       18908 :             poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
    3633       37816 :             poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
    3634             : 
    3635             :         // Is it nearly the requested factor and better (lower) than
    3636             :         // the current best factor?
    3637       26565 :         if (dfDownsamplingFactor >=
    3638       18908 :                 dfDesiredDownsamplingFactor * dfOversamplingThreshold ||
    3639             :             dfDownsamplingFactor <= dfBestDownsamplingFactor)
    3640             :         {
    3641        7657 :             continue;
    3642             :         }
    3643             : 
    3644             :         // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
    3645       11251 :         const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
    3646             : 
    3647       11251 :         if (pszResampling != nullptr &&
    3648          65 :             STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
    3649          16 :             continue;
    3650             : 
    3651             :         // OK, this is our new best overview.
    3652       11235 :         poBestOverview = poOverview;
    3653       11235 :         nBestOverviewLevel = iOverview;
    3654       11235 :         dfBestDownsamplingFactor = dfDownsamplingFactor;
    3655             :     }
    3656             : 
    3657             :     /* -------------------------------------------------------------------- */
    3658             :     /*      If we didn't find an overview that helps us, just return        */
    3659             :     /*      indicating failure and the full resolution image will be used.  */
    3660             :     /* -------------------------------------------------------------------- */
    3661      323324 :     if (nBestOverviewLevel < 0)
    3662      319937 :         return -1;
    3663             : 
    3664             :     /* -------------------------------------------------------------------- */
    3665             :     /*      Recompute the source window in terms of the selected            */
    3666             :     /*      overview.                                                       */
    3667             :     /* -------------------------------------------------------------------- */
    3668             :     const double dfXFactor =
    3669        3387 :         poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
    3670             :     const double dfYFactor =
    3671        3387 :         poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
    3672        3387 :     CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
    3673             :              poBestOverview->GetYSize());
    3674             : 
    3675       10161 :     const int nOXOff = std::min(poBestOverview->GetXSize() - 1,
    3676        3387 :                                 static_cast<int>(nXOff / dfXFactor + 0.5));
    3677       10161 :     const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
    3678        3387 :                                 static_cast<int>(nYOff / dfYFactor + 0.5));
    3679        3387 :     int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));
    3680        3387 :     int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
    3681        3387 :     if (nOXOff + nOXSize > poBestOverview->GetXSize())
    3682           0 :         nOXSize = poBestOverview->GetXSize() - nOXOff;
    3683        3387 :     if (nOYOff + nOYSize > poBestOverview->GetYSize())
    3684           2 :         nOYSize = poBestOverview->GetYSize() - nOYOff;
    3685             : 
    3686        3387 :     if (psExtraArg)
    3687             :     {
    3688        3387 :         if (psExtraArg->bFloatingPointWindowValidity)
    3689             :         {
    3690         537 :             psExtraArg->dfXOff /= dfXFactor;
    3691         537 :             psExtraArg->dfXSize /= dfXFactor;
    3692         537 :             psExtraArg->dfYOff /= dfYFactor;
    3693         537 :             psExtraArg->dfYSize /= dfYFactor;
    3694             :         }
    3695        2850 :         else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
    3696             :         {
    3697          15 :             psExtraArg->bFloatingPointWindowValidity = true;
    3698          15 :             psExtraArg->dfXOff = nXOff / dfXFactor;
    3699          15 :             psExtraArg->dfXSize = nXSize / dfXFactor;
    3700          15 :             psExtraArg->dfYOff = nYOff / dfYFactor;
    3701          15 :             psExtraArg->dfYSize = nYSize / dfYFactor;
    3702             :         }
    3703             :     }
    3704             : 
    3705        3387 :     nXOff = nOXOff;
    3706        3387 :     nYOff = nOYOff;
    3707        3387 :     nXSize = nOXSize;
    3708        3387 :     nYSize = nOYSize;
    3709             : 
    3710        3387 :     return nBestOverviewLevel;
    3711             : }
    3712             : 
    3713             : /************************************************************************/
    3714             : /*                          OverviewRasterIO()                          */
    3715             : /*                                                                      */
    3716             : /*      Special work function to utilize available overviews to         */
    3717             : /*      more efficiently satisfy downsampled requests.  It will         */
    3718             : /*      return CE_Failure if there are no appropriate overviews         */
    3719             : /*      available but it doesn't emit any error messages.               */
    3720             : /************************************************************************/
    3721             : 
    3722             : //! @cond Doxygen_Suppress
    3723           1 : CPLErr GDALRasterBand::OverviewRasterIO(
    3724             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    3725             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    3726             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    3727             : 
    3728             : {
    3729             :     GDALRasterIOExtraArg sExtraArg;
    3730           1 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    3731             : 
    3732           1 :     const int nOverview = GDALBandGetBestOverviewLevel2(
    3733             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);
    3734           1 :     if (nOverview < 0)
    3735           1 :         return CE_Failure;
    3736             : 
    3737             :     /* -------------------------------------------------------------------- */
    3738             :     /*      Recast the call in terms of the new raster layer.               */
    3739             :     /* -------------------------------------------------------------------- */
    3740           0 :     GDALRasterBand *poOverviewBand = GetOverview(nOverview);
    3741           0 :     if (poOverviewBand == nullptr)
    3742           0 :         return CE_Failure;
    3743             : 
    3744           0 :     return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    3745             :                                     pData, nBufXSize, nBufYSize, eBufType,
    3746           0 :                                     nPixelSpace, nLineSpace, &sExtraArg);
    3747             : }
    3748             : 
    3749             : /************************************************************************/
    3750             : /*                      TryOverviewRasterIO()                           */
    3751             : /************************************************************************/
    3752             : 
    3753      161929 : CPLErr GDALRasterBand::TryOverviewRasterIO(
    3754             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    3755             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    3756             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
    3757             :     int *pbTried)
    3758             : {
    3759      161929 :     int nXOffMod = nXOff;
    3760      161929 :     int nYOffMod = nYOff;
    3761      161929 :     int nXSizeMod = nXSize;
    3762      161929 :     int nYSizeMod = nYSize;
    3763             :     GDALRasterIOExtraArg sExtraArg;
    3764             : 
    3765      161929 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    3766             : 
    3767      161929 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    3768             :         this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
    3769             :         &sExtraArg);
    3770             : 
    3771      161929 :     if (iOvrLevel >= 0)
    3772             :     {
    3773          42 :         GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
    3774          42 :         if (poOverviewBand)
    3775             :         {
    3776          42 :             *pbTried = TRUE;
    3777          42 :             return poOverviewBand->RasterIO(
    3778             :                 eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
    3779             :                 nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    3780          42 :                 &sExtraArg);
    3781             :         }
    3782             :     }
    3783             : 
    3784      161887 :     *pbTried = FALSE;
    3785      161887 :     return CE_None;
    3786             : }
    3787             : 
    3788             : /************************************************************************/
    3789             : /*                      TryOverviewRasterIO()                           */
    3790             : /************************************************************************/
    3791             : 
    3792      158057 : CPLErr GDALDataset::TryOverviewRasterIO(
    3793             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    3794             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    3795             :     int nBandCount, int *panBandMap, GSpacing nPixelSpace, GSpacing nLineSpace,
    3796             :     GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg, int *pbTried)
    3797             : {
    3798      158057 :     int nXOffMod = nXOff;
    3799      158057 :     int nYOffMod = nYOff;
    3800      158057 :     int nXSizeMod = nXSize;
    3801      158057 :     int nYSizeMod = nYSize;
    3802             :     GDALRasterIOExtraArg sExtraArg;
    3803      158057 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    3804             : 
    3805      316114 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    3806      158057 :         papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,
    3807             :         nBufYSize, &sExtraArg);
    3808             : 
    3809      158093 :     if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
    3810          36 :         papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)
    3811             :     {
    3812          36 :         *pbTried = TRUE;
    3813          36 :         return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
    3814             :             eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
    3815             :             nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,
    3816          36 :             nLineSpace, nBandSpace, &sExtraArg);
    3817             :     }
    3818             :     else
    3819             :     {
    3820      158021 :         *pbTried = FALSE;
    3821      158021 :         return CE_None;
    3822             :     }
    3823             : }
    3824             : 
    3825             : /************************************************************************/
    3826             : /*                        GetBestOverviewLevel()                        */
    3827             : /*                                                                      */
    3828             : /* Returns the best overview level to satisfy the query or -1 if none   */
    3829             : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid     */
    3830             : /* overview level                                                       */
    3831             : /************************************************************************/
    3832             : 
    3833           4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
    3834             :                                            int &nYOff, int &nXSize, int &nYSize,
    3835             :                                            int nBufXSize, int nBufYSize,
    3836             :                                            int nBandCount, int *panBandMap,
    3837             :                                            GDALRasterIOExtraArg *psExtraArg)
    3838             : {
    3839           4 :     int nOverviewCount = 0;
    3840           4 :     GDALRasterBand *poFirstBand = nullptr;
    3841             : 
    3842             :     /* -------------------------------------------------------------------- */
    3843             :     /* Check that all bands have the same number of overviews and           */
    3844             :     /* that they have all the same size and block dimensions                */
    3845             :     /* -------------------------------------------------------------------- */
    3846          12 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    3847             :     {
    3848           8 :         GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
    3849           8 :         if (poBand == nullptr)
    3850           0 :             return -1;
    3851           8 :         if (iBand == 0)
    3852             :         {
    3853           4 :             poFirstBand = poBand;
    3854           4 :             nOverviewCount = poBand->GetOverviewCount();
    3855             :         }
    3856           4 :         else if (nOverviewCount != poBand->GetOverviewCount())
    3857             :         {
    3858           0 :             CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    3859             :                              "mismatched overview count, use std method.");
    3860           0 :             return -1;
    3861             :         }
    3862             :         else
    3863             :         {
    3864           4 :             for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    3865             :             {
    3866           0 :                 GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
    3867             :                 GDALRasterBand *poOvrFirstBand =
    3868           0 :                     poFirstBand->GetOverview(iOverview);
    3869           0 :                 if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
    3870           0 :                     continue;
    3871             : 
    3872           0 :                 if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
    3873           0 :                     poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
    3874             :                 {
    3875           0 :                     CPLDebug("GDAL",
    3876             :                              "GDALDataset::GetBestOverviewLevel() ... "
    3877             :                              "mismatched overview sizes, use std method.");
    3878           0 :                     return -1;
    3879             :                 }
    3880           0 :                 int nBlockXSizeFirst = 0;
    3881           0 :                 int nBlockYSizeFirst = 0;
    3882           0 :                 poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
    3883             :                                              &nBlockYSizeFirst);
    3884             : 
    3885           0 :                 int nBlockXSizeCurrent = 0;
    3886           0 :                 int nBlockYSizeCurrent = 0;
    3887           0 :                 poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
    3888             :                                         &nBlockYSizeCurrent);
    3889             : 
    3890           0 :                 if (nBlockXSizeFirst != nBlockXSizeCurrent ||
    3891           0 :                     nBlockYSizeFirst != nBlockYSizeCurrent)
    3892             :                 {
    3893           0 :                     CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    3894             :                                      "mismatched block sizes, use std method.");
    3895           0 :                     return -1;
    3896             :                 }
    3897             :             }
    3898             :         }
    3899             :     }
    3900           4 :     if (poFirstBand == nullptr)
    3901           0 :         return -1;
    3902             : 
    3903           4 :     return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,
    3904             :                                          nYSize, nBufXSize, nBufYSize,
    3905           4 :                                          psExtraArg);
    3906             : }
    3907             : 
    3908             : /************************************************************************/
    3909             : /*                         BlockBasedRasterIO()                         */
    3910             : /*                                                                      */
    3911             : /*      This convenience function implements a dataset level            */
    3912             : /*      RasterIO() interface based on calling down to fetch blocks,     */
    3913             : /*      much like the GDALRasterBand::IRasterIO(), but it handles       */
    3914             : /*      all bands at once, so that a format driver that handles a       */
    3915             : /*      request for different bands of the same block efficiently       */
    3916             : /*      (i.e. without re-reading interleaved data) will efficiently.    */
    3917             : /*                                                                      */
    3918             : /*      This method is intended to be called by an overridden           */
    3919             : /*      IRasterIO() method in the driver specific GDALDataset           */
    3920             : /*      derived class.                                                  */
    3921             : /*                                                                      */
    3922             : /*      Default internal implementation of RasterIO() ... utilizes      */
    3923             : /*      the Block access methods to satisfy the request.  This would    */
    3924             : /*      normally only be overridden by formats with overviews.          */
    3925             : /*                                                                      */
    3926             : /*      To keep things relatively simple, this method does not          */
    3927             : /*      currently take advantage of some special cases addressed in     */
    3928             : /*      GDALRasterBand::IRasterIO(), so it is likely best to only       */
    3929             : /*      call it when you know it will help.  That is in cases where     */
    3930             : /*      data is at 1:1 to the buffer, and you know the driver is        */
    3931             : /*      implementing interleaved IO efficiently on a block by block     */
    3932             : /*      basis. Overviews will be used when possible.                    */
    3933             : /************************************************************************/
    3934             : 
    3935       63011 : CPLErr GDALDataset::BlockBasedRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
    3936             :                                        int nXSize, int nYSize, void *pData,
    3937             :                                        int nBufXSize, int nBufYSize,
    3938             :                                        GDALDataType eBufType, int nBandCount,
    3939             :                                        int *panBandMap, GSpacing nPixelSpace,
    3940             :                                        GSpacing nLineSpace, GSpacing nBandSpace,
    3941             :                                        GDALRasterIOExtraArg *psExtraArg)
    3942             : 
    3943             : {
    3944       63011 :     CPLAssert(nullptr != pData);
    3945             : 
    3946       63011 :     GByte **papabySrcBlock = nullptr;
    3947       63011 :     GDALRasterBlock *poBlock = nullptr;
    3948       63011 :     GDALRasterBlock **papoBlocks = nullptr;
    3949       63011 :     int nLBlockX = -1;
    3950       63011 :     int nLBlockY = -1;
    3951             :     int iBufYOff;
    3952             :     int iBufXOff;
    3953       63011 :     int nBlockXSize = 1;
    3954       63011 :     int nBlockYSize = 1;
    3955       63011 :     CPLErr eErr = CE_None;
    3956       63011 :     GDALDataType eDataType = GDT_Byte;
    3957             : 
    3958       63011 :     const bool bUseIntegerRequestCoords =
    3959       63038 :         (!psExtraArg->bFloatingPointWindowValidity ||
    3960          27 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
    3961          25 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
    3962             : 
    3963             :     /* -------------------------------------------------------------------- */
    3964             :     /*      Ensure that all bands share a common block size and data type.  */
    3965             :     /* -------------------------------------------------------------------- */
    3966      298651 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    3967             :     {
    3968      235640 :         GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    3969             : 
    3970      235639 :         if (iBand == 0)
    3971             :         {
    3972       63009 :             poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    3973       63011 :             eDataType = poBand->GetRasterDataType();
    3974             :         }
    3975             :         else
    3976             :         {
    3977      172630 :             int nThisBlockXSize = 0;
    3978      172630 :             int nThisBlockYSize = 0;
    3979      172630 :             poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
    3980      172630 :             if (nThisBlockXSize != nBlockXSize ||
    3981      172630 :                 nThisBlockYSize != nBlockYSize)
    3982             :             {
    3983           0 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    3984             :                                  "mismatched block sizes, use std method.");
    3985           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    3986             :                                          pData, nBufXSize, nBufYSize, eBufType,
    3987             :                                          nBandCount, panBandMap, nPixelSpace,
    3988           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    3989             :             }
    3990             : 
    3991      172630 :             if (eDataType != poBand->GetRasterDataType() &&
    3992           0 :                 (nXSize != nBufXSize || nYSize != nBufYSize))
    3993             :             {
    3994           1 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    3995             :                                  "mismatched band data types, use std method.");
    3996           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    3997             :                                          pData, nBufXSize, nBufYSize, eBufType,
    3998             :                                          nBandCount, panBandMap, nPixelSpace,
    3999           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4000             :             }
    4001             :         }
    4002             :     }
    4003             : 
    4004             :     /* ==================================================================== */
    4005             :     /*      In this special case at full resolution we step through in      */
    4006             :     /*      blocks, turning the request over to the per-band                */
    4007             :     /*      IRasterIO(), but ensuring that all bands of one block are       */
    4008             :     /*      called before proceeding to the next.                           */
    4009             :     /* ==================================================================== */
    4010             : 
    4011       63011 :     if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
    4012             :     {
    4013             :         GDALRasterIOExtraArg sDummyExtraArg;
    4014       63007 :         INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
    4015             : 
    4016       63007 :         int nChunkYSize = 0;
    4017       63007 :         int nChunkXSize = 0;
    4018             : 
    4019      176766 :         for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
    4020             :         {
    4021      114785 :             const int nChunkYOff = iBufYOff + nYOff;
    4022      114785 :             nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
    4023      114785 :             if (nChunkYOff + nChunkYSize > nYOff + nYSize)
    4024       58468 :                 nChunkYSize = (nYOff + nYSize) - nChunkYOff;
    4025             : 
    4026      753467 :             for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
    4027             :             {
    4028      639707 :                 const int nChunkXOff = iBufXOff + nXOff;
    4029      639707 :                 nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
    4030      639707 :                 if (nChunkXOff + nChunkXSize > nXOff + nXSize)
    4031       71265 :                     nChunkXSize = (nXOff + nXSize) - nChunkXOff;
    4032             : 
    4033      639707 :                 GByte *pabyChunkData =
    4034      639707 :                     static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
    4035      639707 :                     static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
    4036             : 
    4037     3144200 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4038             :                 {
    4039     2505520 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4040             : 
    4041     5011030 :                     eErr = poBand->GDALRasterBand::IRasterIO(
    4042             :                         eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
    4043             :                         nChunkYSize,
    4044     2505520 :                         pabyChunkData +
    4045     2505520 :                             static_cast<GPtrDiff_t>(iBand) * nBandSpace,
    4046             :                         nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
    4047             :                         nLineSpace, &sDummyExtraArg);
    4048     2505520 :                     if (eErr != CE_None)
    4049        1024 :                         return eErr;
    4050             :                 }
    4051             :             }
    4052             : 
    4053      133008 :             if (psExtraArg->pfnProgress != nullptr &&
    4054       19248 :                 !psExtraArg->pfnProgress(
    4055      133008 :                     1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
    4056             :                         nBufYSize,
    4057             :                     "", psExtraArg->pProgressData))
    4058             :             {
    4059           1 :                 return CE_Failure;
    4060             :             }
    4061             :         }
    4062             : 
    4063       61981 :         return CE_None;
    4064             :     }
    4065             : 
    4066             :     /* Below code is not compatible with that case. It would need a complete */
    4067             :     /* separate code like done in GDALRasterBand::IRasterIO. */
    4068           4 :     if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
    4069             :     {
    4070           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4071             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4072             :                                  panBandMap, nPixelSpace, nLineSpace,
    4073           0 :                                  nBandSpace, psExtraArg);
    4074             :     }
    4075             : 
    4076             :     /* We could have a smarter implementation, but that will do for now */
    4077           4 :     if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    4078           0 :         (nBufXSize != nXSize || nBufYSize != nYSize))
    4079             :     {
    4080           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4081             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4082             :                                  panBandMap, nPixelSpace, nLineSpace,
    4083           0 :                                  nBandSpace, psExtraArg);
    4084             :     }
    4085             : 
    4086             :     /* ==================================================================== */
    4087             :     /*      Loop reading required source blocks to satisfy output           */
    4088             :     /*      request.  This is the most general implementation.              */
    4089             :     /* ==================================================================== */
    4090             : 
    4091           4 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
    4092             : 
    4093             :     papabySrcBlock =
    4094           4 :         static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
    4095             :     papoBlocks =
    4096           4 :         static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
    4097             : 
    4098             :     /* -------------------------------------------------------------------- */
    4099             :     /*      Select an overview level if appropriate.                        */
    4100             :     /* -------------------------------------------------------------------- */
    4101             : 
    4102             :     GDALRasterIOExtraArg sExtraArg;
    4103           4 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4104           4 :     const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
    4105             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
    4106             :         panBandMap, &sExtraArg);
    4107           4 :     if (nOverviewLevel >= 0)
    4108             :     {
    4109           2 :         GetRasterBand(panBandMap[0])
    4110           2 :             ->GetOverview(nOverviewLevel)
    4111           2 :             ->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4112             :     }
    4113             : 
    4114           4 :     double dfXOff = nXOff;
    4115           4 :     double dfYOff = nYOff;
    4116           4 :     double dfXSize = nXSize;
    4117           4 :     double dfYSize = nYSize;
    4118           4 :     if (sExtraArg.bFloatingPointWindowValidity)
    4119             :     {
    4120           2 :         dfXOff = sExtraArg.dfXOff;
    4121           2 :         dfYOff = sExtraArg.dfYOff;
    4122           2 :         dfXSize = sExtraArg.dfXSize;
    4123           2 :         dfYSize = sExtraArg.dfYSize;
    4124             :     }
    4125             : 
    4126             :     /* -------------------------------------------------------------------- */
    4127             :     /*      Compute stepping increment.                                     */
    4128             :     /* -------------------------------------------------------------------- */
    4129           4 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
    4130           4 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
    4131             : 
    4132           4 :     constexpr double EPS = 1e-10;
    4133             :     /* -------------------------------------------------------------------- */
    4134             :     /*      Loop over buffer computing source locations.                    */
    4135             :     /* -------------------------------------------------------------------- */
    4136          36 :     for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
    4137             :     {
    4138             :         GPtrDiff_t iSrcOffset;
    4139             : 
    4140             :         // Add small epsilon to avoid some numeric precision issues.
    4141          32 :         const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
    4142          32 :         const int iSrcY = static_cast<int>(std::min(
    4143          32 :             std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
    4144             : 
    4145          32 :         GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
    4146             :                                 static_cast<GPtrDiff_t>(nLineSpace);
    4147             : 
    4148         302 :         for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
    4149             :         {
    4150         270 :             const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
    4151         270 :             const int iSrcX = static_cast<int>(std::min(
    4152         270 :                 std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
    4153             : 
    4154             :             // FIXME: this code likely doesn't work if the dirty block gets
    4155             :             // flushed to disk before being completely written. In the meantime,
    4156             :             // bJustInitialize should probably be set to FALSE even if it is not
    4157             :             // ideal performance wise, and for lossy compression
    4158             : 
    4159             :             /* --------------------------------------------------------------------
    4160             :              */
    4161             :             /*      Ensure we have the appropriate block loaded. */
    4162             :             /* --------------------------------------------------------------------
    4163             :              */
    4164         270 :             if (iSrcX < nLBlockX * nBlockXSize ||
    4165         270 :                 iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
    4166         266 :                 iSrcY < nLBlockY * nBlockYSize ||
    4167         266 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
    4168             :             {
    4169           4 :                 nLBlockX = iSrcX / nBlockXSize;
    4170           4 :                 nLBlockY = iSrcY / nBlockYSize;
    4171             : 
    4172           4 :                 const bool bJustInitialize =
    4173           0 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
    4174           0 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
    4175           4 :                     nXOff <= nLBlockX * nBlockXSize &&
    4176           0 :                     nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
    4177             :                 /*bool bMemZeroBuffer = FALSE;
    4178             :                 if( eRWFlag == GF_Write && !bJustInitialize &&
    4179             :                     nXOff <= nLBlockX * nBlockXSize &&
    4180             :                     nYOff <= nLBlockY * nBlockYSize &&
    4181             :                     (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
    4182             :                      (nXOff + nXSize == GetRasterXSize() &&
    4183             :                      (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
    4184             :                     (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
    4185             :                      (nYOff + nYSize == GetRasterYSize() &&
    4186             :                      (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
    4187             :                 {
    4188             :                     bJustInitialize = TRUE;
    4189             :                     bMemZeroBuffer = TRUE;
    4190             :                 }*/
    4191          12 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4192             :                 {
    4193           8 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4194           8 :                     if (nOverviewLevel >= 0)
    4195           2 :                         poBand = poBand->GetOverview(nOverviewLevel);
    4196           8 :                     poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
    4197             :                                                         bJustInitialize);
    4198           8 :                     if (poBlock == nullptr)
    4199             :                     {
    4200           0 :                         eErr = CE_Failure;
    4201           0 :                         goto CleanupAndReturn;
    4202             :                     }
    4203             : 
    4204           8 :                     if (eRWFlag == GF_Write)
    4205           0 :                         poBlock->MarkDirty();
    4206             : 
    4207           8 :                     if (papoBlocks[iBand] != nullptr)
    4208           0 :                         papoBlocks[iBand]->DropLock();
    4209             : 
    4210           8 :                     papoBlocks[iBand] = poBlock;
    4211             : 
    4212           8 :                     papabySrcBlock[iBand] =
    4213           8 :                         static_cast<GByte *>(poBlock->GetDataRef());
    4214             :                     /*if( bMemZeroBuffer )
    4215             :                     {
    4216             :                         memset(papabySrcBlock[iBand], 0,
    4217             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
    4218             :                     * nBlockYSize);
    4219             :                     }*/
    4220             :                 }
    4221             :             }
    4222             : 
    4223             :             /* --------------------------------------------------------------------
    4224             :              */
    4225             :             /*      Copy over this pixel of data. */
    4226             :             /* --------------------------------------------------------------------
    4227             :              */
    4228         270 :             iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
    4229         270 :                           static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
    4230         270 :                           (static_cast<GPtrDiff_t>(iSrcY) -
    4231         270 :                            static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
    4232         270 :                               nBlockXSize) *
    4233         270 :                          nBandDataSize;
    4234             : 
    4235         980 :             for (int iBand = 0; iBand < nBandCount; iBand++)
    4236             :             {
    4237         710 :                 GByte *pabySrcBlock = papabySrcBlock[iBand];
    4238         710 :                 GPtrDiff_t iBandBufOffset =
    4239         710 :                     iBufOffset + static_cast<GPtrDiff_t>(iBand) *
    4240             :                                      static_cast<GPtrDiff_t>(nBandSpace);
    4241             : 
    4242         710 :                 if (eDataType == eBufType)
    4243             :                 {
    4244         710 :                     if (eRWFlag == GF_Read)
    4245         710 :                         memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
    4246         710 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
    4247             :                     else
    4248           0 :                         memcpy(pabySrcBlock + iSrcOffset,
    4249             :                                static_cast<const GByte *>(pData) +
    4250           0 :                                    iBandBufOffset,
    4251             :                                nBandDataSize);
    4252             :                 }
    4253             :                 else
    4254             :                 {
    4255             :                     /* type to type conversion ... ouch, this is expensive way
    4256             :                        of handling single words */
    4257             : 
    4258           0 :                     if (eRWFlag == GF_Read)
    4259           0 :                         GDALCopyWords(pabySrcBlock + iSrcOffset, eDataType, 0,
    4260             :                                       static_cast<GByte *>(pData) +
    4261           0 :                                           iBandBufOffset,
    4262             :                                       eBufType, 0, 1);
    4263             :                     else
    4264           0 :                         GDALCopyWords(static_cast<const GByte *>(pData) +
    4265           0 :                                           iBandBufOffset,
    4266           0 :                                       eBufType, 0, pabySrcBlock + iSrcOffset,
    4267             :                                       eDataType, 0, 1);
    4268             :                 }
    4269             :             }
    4270             : 
    4271         270 :             iBufOffset += static_cast<int>(nPixelSpace);
    4272             :         }
    4273             :     }
    4274             : 
    4275             :     /* -------------------------------------------------------------------- */
    4276             :     /*      CleanupAndReturn.                                               */
    4277             :     /* -------------------------------------------------------------------- */
    4278           4 : CleanupAndReturn:
    4279           4 :     CPLFree(papabySrcBlock);
    4280           4 :     if (papoBlocks != nullptr)
    4281             :     {
    4282          12 :         for (int iBand = 0; iBand < nBandCount; iBand++)
    4283             :         {
    4284           8 :             if (papoBlocks[iBand] != nullptr)
    4285           8 :                 papoBlocks[iBand]->DropLock();
    4286             :         }
    4287           4 :         CPLFree(papoBlocks);
    4288             :     }
    4289             : 
    4290           4 :     return eErr;
    4291             : }
    4292             : 
    4293             : //! @endcond
    4294             : 
    4295             : /************************************************************************/
    4296             : /*                  GDALCopyWholeRasterGetSwathSize()                   */
    4297             : /************************************************************************/
    4298             : 
    4299        1948 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
    4300             :                                             GDALRasterBand *poDstPrototypeBand,
    4301             :                                             int nBandCount,
    4302             :                                             int bDstIsCompressed,
    4303             :                                             int bInterleave, int *pnSwathCols,
    4304             :                                             int *pnSwathLines)
    4305             : {
    4306        1948 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    4307        1948 :     int nSrcBlockXSize = 0;
    4308        1948 :     int nSrcBlockYSize = 0;
    4309        1948 :     int nBlockXSize = 0;
    4310        1948 :     int nBlockYSize = 0;
    4311             : 
    4312        1948 :     int nXSize = poSrcPrototypeBand->GetXSize();
    4313        1948 :     int nYSize = poSrcPrototypeBand->GetYSize();
    4314             : 
    4315        1948 :     poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
    4316        1948 :     poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4317             : 
    4318        1948 :     const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
    4319        1948 :     const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
    4320             : 
    4321        1948 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    4322        1948 :     if (bInterleave)
    4323         508 :         nPixelSize *= nBandCount;
    4324             : 
    4325             :     // aim for one row of blocks.  Do not settle for less.
    4326        1948 :     int nSwathCols = nXSize;
    4327        1948 :     int nSwathLines = nMaxBlockYSize;
    4328             : 
    4329             :     const char *pszSrcCompression =
    4330        1948 :         poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    4331        1948 :     if (pszSrcCompression == nullptr)
    4332             :     {
    4333        1922 :         auto poSrcDS = poSrcPrototypeBand->GetDataset();
    4334        1922 :         if (poSrcDS)
    4335             :             pszSrcCompression =
    4336        1916 :                 poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    4337             :     }
    4338             : 
    4339             :     /* -------------------------------------------------------------------- */
    4340             :     /*      What will our swath size be?                                    */
    4341             :     /* -------------------------------------------------------------------- */
    4342             :     // When writing interleaved data in a compressed format, we want to be sure
    4343             :     // that each block will only be written once, so the swath size must not be
    4344             :     // greater than the block cache.
    4345        1948 :     const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
    4346             :     int nTargetSwathSize;
    4347        1948 :     if (pszSwathSize != nullptr)
    4348           0 :         nTargetSwathSize = static_cast<int>(
    4349           0 :             std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
    4350             :     else
    4351             :     {
    4352             :         // As a default, take one 1/4 of the cache size.
    4353        1948 :         nTargetSwathSize = static_cast<int>(
    4354        1948 :             std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
    4355             : 
    4356             :         // but if the minimum idal swath buf size is less, then go for it to
    4357             :         // avoid unnecessarily abusing RAM usage.
    4358             :         // but try to use 10 MB at least.
    4359        1948 :         GIntBig nIdealSwathBufSize =
    4360        1948 :             static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
    4361        1948 :         int nMinTargetSwathSize = 10 * 1000 * 1000;
    4362             : 
    4363        1948 :         if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
    4364        1948 :              GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
    4365             :         {
    4366           2 :             nMinTargetSwathSize = nTargetSwathSize;
    4367             :         }
    4368             : 
    4369        1948 :         if (nIdealSwathBufSize < nTargetSwathSize &&
    4370        1938 :             nIdealSwathBufSize < nMinTargetSwathSize)
    4371             :         {
    4372        1935 :             nIdealSwathBufSize = nMinTargetSwathSize;
    4373             :         }
    4374             : 
    4375        1948 :         if (pszSrcCompression != nullptr &&
    4376         153 :             EQUAL(pszSrcCompression, "JPEG2000") &&
    4377           0 :             (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
    4378           0 :                                    (nSrcBlockYSize % nBlockYSize) == 0)))
    4379             :         {
    4380           2 :             nIdealSwathBufSize =
    4381           4 :                 std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
    4382           2 :                                                  nSrcBlockYSize * nPixelSize);
    4383             :         }
    4384        1948 :         if (nTargetSwathSize > nIdealSwathBufSize)
    4385        1934 :             nTargetSwathSize = static_cast<int>(
    4386        1934 :                 std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
    4387             :     }
    4388             : 
    4389        1948 :     if (nTargetSwathSize < 1000000)
    4390           8 :         nTargetSwathSize = 1000000;
    4391             : 
    4392             :     /* But let's check that  */
    4393        2148 :     if (bDstIsCompressed && bInterleave &&
    4394         200 :         nTargetSwathSize > GDALGetCacheMax64())
    4395             :     {
    4396           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    4397             :                  "When translating into a compressed interleave format, "
    4398             :                  "the block cache size (" CPL_FRMT_GIB ") "
    4399             :                  "should be at least the size of the swath (%d) "
    4400             :                  "(GDAL_SWATH_SIZE config. option)",
    4401             :                  GDALGetCacheMax64(), nTargetSwathSize);
    4402             :     }
    4403             : 
    4404             : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
    4405             : #define ROUND_TO(x, y) (((x) / (y)) * (y))
    4406             : 
    4407             :     // if both input and output datasets are tiled, that the tile dimensions
    4408             :     // are "compatible", try to stick  to a swath dimension that is a multiple
    4409             :     // of input and output block dimensions.
    4410        1948 :     if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
    4411          32 :         IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
    4412          32 :         IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
    4413          32 :         IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    4414          32 :         IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    4415             :     {
    4416          32 :         if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
    4417          32 :                 nPixelSize <=
    4418          32 :             static_cast<GIntBig>(nTargetSwathSize))
    4419             :         {
    4420          32 :             nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
    4421          32 :             nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
    4422          32 :             if (nSwathCols == 0)
    4423           0 :                 nSwathCols = nMaxBlockXSize;
    4424          32 :             if (nSwathCols > nXSize)
    4425          30 :                 nSwathCols = nXSize;
    4426          32 :             nSwathLines = nMaxBlockYSize;
    4427             : 
    4428          32 :             if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
    4429          32 :                 static_cast<GIntBig>(nTargetSwathSize))
    4430             :             {
    4431           0 :                 nSwathCols = nXSize;
    4432           0 :                 nSwathLines = nBlockYSize;
    4433             :             }
    4434             :         }
    4435             :     }
    4436             : 
    4437        1948 :     const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
    4438        1948 :     const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
    4439        1948 :     if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
    4440             :     {
    4441           1 :         nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
    4442           1 :         if (nSwathLines == 0)
    4443           1 :             nSwathLines = 1;
    4444             : 
    4445           1 :         CPLDebug(
    4446             :             "GDAL",
    4447             :             "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
    4448             :             "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
    4449             :             "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
    4450           1 :             nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
    4451             :     }
    4452             :     // If we are processing single scans, try to handle several at once.
    4453             :     // If we are handling swaths already, only grow the swath if a row
    4454             :     // of blocks is substantially less than our target buffer size.
    4455        1947 :     else if (nSwathLines == 1 ||
    4456        1458 :              nMemoryPerCol * nSwathLines <
    4457        1458 :                  static_cast<GIntBig>(nTargetSwathSize) / 10)
    4458             :     {
    4459        1920 :         nSwathLines = std::min(
    4460             :             nYSize,
    4461        1920 :             std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
    4462             : 
    4463             :         /* If possible try to align to source and target block height */
    4464        1920 :         if ((nSwathLines % nMaxBlockYSize) != 0 &&
    4465         187 :             nSwathLines > nMaxBlockYSize &&
    4466         187 :             IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    4467         159 :             IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    4468         139 :             nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
    4469             :     }
    4470             : 
    4471        1948 :     if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
    4472           0 :         (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
    4473           0 :                                IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
    4474             :     {
    4475             :         // Typical use case: converting from Pleaiades that is 2048x2048 tiled.
    4476           2 :         if (nSwathLines < nSrcBlockYSize)
    4477             :         {
    4478           0 :             nSwathLines = nSrcBlockYSize;
    4479             : 
    4480             :             // Number of pixels that can be read/write simultaneously.
    4481           0 :             nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
    4482           0 :             nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
    4483           0 :             if (nSwathCols == 0)
    4484           0 :                 nSwathCols = nSrcBlockXSize;
    4485           0 :             if (nSwathCols > nXSize)
    4486           0 :                 nSwathCols = nXSize;
    4487             : 
    4488           0 :             CPLDebug(
    4489             :                 "GDAL",
    4490             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    4491             :                 "too high block, "
    4492             :                 "use partial width at one time");
    4493             :         }
    4494           2 :         else if ((nSwathLines % nSrcBlockYSize) != 0)
    4495             :         {
    4496             :             /* Round on a multiple of nSrcBlockYSize */
    4497           0 :             nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
    4498           0 :             CPLDebug(
    4499             :                 "GDAL",
    4500             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    4501             :                 "round nSwathLines to block height : %d",
    4502             :                 nSwathLines);
    4503             :         }
    4504             :     }
    4505        1946 :     else if (bDstIsCompressed)
    4506             :     {
    4507         355 :         if (nSwathLines < nBlockYSize)
    4508             :         {
    4509         137 :             nSwathLines = nBlockYSize;
    4510             : 
    4511             :             // Number of pixels that can be read/write simultaneously.
    4512         137 :             nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
    4513         137 :             nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
    4514         137 :             if (nSwathCols == 0)
    4515           0 :                 nSwathCols = nBlockXSize;
    4516         137 :             if (nSwathCols > nXSize)
    4517         137 :                 nSwathCols = nXSize;
    4518             : 
    4519         137 :             CPLDebug(
    4520             :                 "GDAL",
    4521             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    4522             :                 "too high block, "
    4523             :                 "use partial width at one time");
    4524             :         }
    4525         218 :         else if ((nSwathLines % nBlockYSize) != 0)
    4526             :         {
    4527             :             // Round on a multiple of nBlockYSize.
    4528           9 :             nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
    4529           9 :             CPLDebug(
    4530             :                 "GDAL",
    4531             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    4532             :                 "round nSwathLines to block height : %d",
    4533             :                 nSwathLines);
    4534             :         }
    4535             :     }
    4536             : 
    4537        1948 :     *pnSwathCols = nSwathCols;
    4538        1948 :     *pnSwathLines = nSwathLines;
    4539        1948 : }
    4540             : 
    4541             : /************************************************************************/
    4542             : /*                     GDALDatasetCopyWholeRaster()                     */
    4543             : /************************************************************************/
    4544             : 
    4545             : /**
    4546             :  * \brief Copy all dataset raster data.
    4547             :  *
    4548             :  * This function copies the complete raster contents of one dataset to
    4549             :  * another similarly configured dataset.  The source and destination
    4550             :  * dataset must have the same number of bands, and the same width
    4551             :  * and height.  The bands do not have to have the same data type.
    4552             :  *
    4553             :  * This function is primarily intended to support implementation of
    4554             :  * driver specific CreateCopy() functions.  It implements efficient copying,
    4555             :  * in particular "chunking" the copy in substantial blocks and, if appropriate,
    4556             :  * performing the transfer in a pixel interleaved fashion.
    4557             :  *
    4558             :  * Currently the only papszOptions value supported are :
    4559             :  * <ul>
    4560             :  * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
    4561             :  * write access pattern (this does not modify the layout of the destination
    4562             :  * data)</li> <li>"COMPRESSED=YES" to force alignment on target dataset block
    4563             :  * sizes to achieve best compression.</li> <li>"SKIP_HOLES=YES" to skip chunks
    4564             :  * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
    4565             :  * (GDAL &gt;= 2.2)</li>
    4566             :  * </ul>
    4567             :  * More options may be supported in the future.
    4568             :  *
    4569             :  * @param hSrcDS the source dataset
    4570             :  * @param hDstDS the destination dataset
    4571             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    4572             :  * @param pfnProgress progress reporting function.
    4573             :  * @param pProgressData callback data for progress function.
    4574             :  *
    4575             :  * @return CE_None on success, or CE_Failure on failure.
    4576             :  */
    4577             : 
    4578        1930 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
    4579             :                                               GDALDatasetH hDstDS,
    4580             :                                               CSLConstList papszOptions,
    4581             :                                               GDALProgressFunc pfnProgress,
    4582             :                                               void *pProgressData)
    4583             : 
    4584             : {
    4585        1930 :     VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    4586        1930 :     VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    4587             : 
    4588        1930 :     GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
    4589        1930 :     GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
    4590             : 
    4591        1930 :     if (pfnProgress == nullptr)
    4592           0 :         pfnProgress = GDALDummyProgress;
    4593             : 
    4594             :     /* -------------------------------------------------------------------- */
    4595             :     /*      Confirm the datasets match in size and band counts.             */
    4596             :     /* -------------------------------------------------------------------- */
    4597        1930 :     const int nXSize = poDstDS->GetRasterXSize();
    4598        1930 :     const int nYSize = poDstDS->GetRasterYSize();
    4599        1930 :     const int nBandCount = poDstDS->GetRasterCount();
    4600             : 
    4601        1930 :     if (poSrcDS->GetRasterXSize() != nXSize ||
    4602        3860 :         poSrcDS->GetRasterYSize() != nYSize ||
    4603        1930 :         poSrcDS->GetRasterCount() != nBandCount)
    4604             :     {
    4605           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    4606             :                  "Input and output dataset sizes or band counts do not\n"
    4607             :                  "match in GDALDatasetCopyWholeRaster()");
    4608           0 :         return CE_Failure;
    4609             :     }
    4610             : 
    4611             :     /* -------------------------------------------------------------------- */
    4612             :     /*      Report preliminary (0) progress.                                */
    4613             :     /* -------------------------------------------------------------------- */
    4614        1930 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    4615             :     {
    4616           1 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    4617             :                  "User terminated CreateCopy()");
    4618           1 :         return CE_Failure;
    4619             :     }
    4620             : 
    4621             :     /* -------------------------------------------------------------------- */
    4622             :     /*      Get our prototype band, and assume the others are similarly     */
    4623             :     /*      configured.                                                     */
    4624             :     /* -------------------------------------------------------------------- */
    4625        1929 :     if (nBandCount == 0)
    4626           0 :         return CE_None;
    4627             : 
    4628        1929 :     GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
    4629        1929 :     GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
    4630        1929 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    4631             : 
    4632             :     /* -------------------------------------------------------------------- */
    4633             :     /*      Do we want to try and do the operation in a pixel               */
    4634             :     /*      interleaved fashion?                                            */
    4635             :     /* -------------------------------------------------------------------- */
    4636        1929 :     bool bInterleave = false;
    4637             :     const char *pszInterleave =
    4638        1929 :         poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    4639        1929 :     if (pszInterleave != nullptr &&
    4640         997 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    4641         145 :         bInterleave = true;
    4642             : 
    4643        1929 :     pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    4644        1929 :     if (pszInterleave != nullptr &&
    4645        1189 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    4646         480 :         bInterleave = true;
    4647             : 
    4648        1929 :     pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
    4649        1929 :     if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
    4650           5 :         bInterleave = true;
    4651        1924 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
    4652           7 :         bInterleave = false;
    4653             :     // attributes is specific to the TileDB driver
    4654        1917 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
    4655           4 :         bInterleave = true;
    4656        1913 :     else if (pszInterleave != nullptr)
    4657             :     {
    4658           0 :         CPLError(CE_Warning, CPLE_NotSupported,
    4659             :                  "Unsupported value for option INTERLEAVE");
    4660             :     }
    4661             : 
    4662             :     // If the destination is compressed, we must try to write blocks just once,
    4663             :     // to save disk space (GTiff case for example), and to avoid data loss
    4664             :     // (JPEG compression for example).
    4665        1929 :     bool bDstIsCompressed = false;
    4666             :     const char *pszDstCompressed =
    4667        1929 :         CSLFetchNameValue(papszOptions, "COMPRESSED");
    4668        1929 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    4669         338 :         bDstIsCompressed = true;
    4670             : 
    4671             :     /* -------------------------------------------------------------------- */
    4672             :     /*      What will our swath size be?                                    */
    4673             :     /* -------------------------------------------------------------------- */
    4674             : 
    4675        1929 :     int nSwathCols = 0;
    4676        1929 :     int nSwathLines = 0;
    4677        1929 :     GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
    4678             :                                     nBandCount, bDstIsCompressed, bInterleave,
    4679             :                                     &nSwathCols, &nSwathLines);
    4680             : 
    4681        1929 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    4682        1929 :     if (bInterleave)
    4683         508 :         nPixelSize *= nBandCount;
    4684             : 
    4685        1929 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    4686        1929 :     if (pSwathBuf == nullptr)
    4687             :     {
    4688           0 :         return CE_Failure;
    4689             :     }
    4690             : 
    4691        1929 :     CPLDebug("GDAL",
    4692             :              "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
    4693             :              nSwathCols, nSwathLines, static_cast<int>(bInterleave));
    4694             : 
    4695             :     // Advise the source raster that we are going to read it completely
    4696             :     // Note: this might already have been done by GDALCreateCopy() in the
    4697             :     // likely case this function is indirectly called by it
    4698        1929 :     poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
    4699        1929 :                         nullptr, nullptr);
    4700             : 
    4701             :     /* ==================================================================== */
    4702             :     /*      Band oriented (uninterleaved) case.                             */
    4703             :     /* ==================================================================== */
    4704        1929 :     CPLErr eErr = CE_None;
    4705             :     const bool bCheckHoles =
    4706        1929 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    4707             : 
    4708        1929 :     if (!bInterleave)
    4709             :     {
    4710             :         GDALRasterIOExtraArg sExtraArg;
    4711        1421 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    4712        1421 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    4713             : 
    4714        4263 :         const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
    4715        1421 :                                      DIV_ROUND_UP(nYSize, nSwathLines) *
    4716        1421 :                                      DIV_ROUND_UP(nXSize, nSwathCols);
    4717        1421 :         GIntBig nBlocksDone = 0;
    4718             : 
    4719        3490 :         for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
    4720             :         {
    4721        2069 :             int nBand = iBand + 1;
    4722             : 
    4723        4292 :             for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    4724             :             {
    4725        2223 :                 int nThisLines = nSwathLines;
    4726             : 
    4727        2223 :                 if (iY + nThisLines > nYSize)
    4728         270 :                     nThisLines = nYSize - iY;
    4729             : 
    4730        4446 :                 for (int iX = 0; iX < nXSize && eErr == CE_None;
    4731        2223 :                      iX += nSwathCols)
    4732             :                 {
    4733        2223 :                     int nThisCols = nSwathCols;
    4734             : 
    4735        2223 :                     if (iX + nThisCols > nXSize)
    4736           0 :                         nThisCols = nXSize - iX;
    4737             : 
    4738        2223 :                     int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    4739        2223 :                     if (bCheckHoles)
    4740             :                     {
    4741             :                         nStatus = poSrcDS->GetRasterBand(nBand)
    4742         904 :                                       ->GetDataCoverageStatus(
    4743             :                                           iX, iY, nThisCols, nThisLines,
    4744             :                                           GDAL_DATA_COVERAGE_STATUS_DATA);
    4745             :                     }
    4746        2223 :                     if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    4747             :                     {
    4748        2220 :                         sExtraArg.pfnProgress = GDALScaledProgress;
    4749        4440 :                         sExtraArg.pProgressData = GDALCreateScaledProgress(
    4750        2220 :                             nBlocksDone / static_cast<double>(nTotalBlocks),
    4751        2220 :                             (nBlocksDone + 0.5) /
    4752        2220 :                                 static_cast<double>(nTotalBlocks),
    4753             :                             pfnProgress, pProgressData);
    4754        2220 :                         if (sExtraArg.pProgressData == nullptr)
    4755        1304 :                             sExtraArg.pfnProgress = nullptr;
    4756             : 
    4757        2220 :                         eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    4758             :                                                  nThisLines, pSwathBuf,
    4759             :                                                  nThisCols, nThisLines, eDT, 1,
    4760             :                                                  &nBand, 0, 0, 0, &sExtraArg);
    4761             : 
    4762        2220 :                         GDALDestroyScaledProgress(sExtraArg.pProgressData);
    4763             : 
    4764        2220 :                         if (eErr == CE_None)
    4765        2216 :                             eErr = poDstDS->RasterIO(
    4766             :                                 GF_Write, iX, iY, nThisCols, nThisLines,
    4767             :                                 pSwathBuf, nThisCols, nThisLines, eDT, 1,
    4768             :                                 &nBand, 0, 0, 0, nullptr);
    4769             :                     }
    4770             : 
    4771        2223 :                     nBlocksDone++;
    4772        4407 :                     if (eErr == CE_None &&
    4773        2184 :                         !pfnProgress(nBlocksDone /
    4774        2184 :                                          static_cast<double>(nTotalBlocks),
    4775             :                                      nullptr, pProgressData))
    4776             :                     {
    4777           3 :                         eErr = CE_Failure;
    4778           3 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    4779             :                                  "User terminated CreateCopy()");
    4780             :                     }
    4781             :                 }
    4782             :             }
    4783             :         }
    4784             :     }
    4785             : 
    4786             :     /* ==================================================================== */
    4787             :     /*      Pixel interleaved case.                                         */
    4788             :     /* ==================================================================== */
    4789             :     else /* if( bInterleave ) */
    4790             :     {
    4791             :         GDALRasterIOExtraArg sExtraArg;
    4792         508 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    4793         508 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    4794             : 
    4795         508 :         const GIntBig nTotalBlocks =
    4796         508 :             static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
    4797         508 :             DIV_ROUND_UP(nXSize, nSwathCols);
    4798         508 :         GIntBig nBlocksDone = 0;
    4799             : 
    4800        1216 :         for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    4801             :         {
    4802         708 :             int nThisLines = nSwathLines;
    4803             : 
    4804         708 :             if (iY + nThisLines > nYSize)
    4805         173 :                 nThisLines = nYSize - iY;
    4806             : 
    4807        1421 :             for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    4808             :             {
    4809         713 :                 int nThisCols = nSwathCols;
    4810             : 
    4811         713 :                 if (iX + nThisCols > nXSize)
    4812           3 :                     nThisCols = nXSize - iX;
    4813             : 
    4814         713 :                 int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    4815         713 :                 if (bCheckHoles)
    4816             :                 {
    4817         437 :                     for (int iBand = 0; iBand < nBandCount; iBand++)
    4818             :                     {
    4819         437 :                         nStatus |= poSrcDS->GetRasterBand(iBand + 1)
    4820         437 :                                        ->GetDataCoverageStatus(
    4821             :                                            iX, iY, nThisCols, nThisLines,
    4822             :                                            GDAL_DATA_COVERAGE_STATUS_DATA);
    4823         437 :                         if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    4824         437 :                             break;
    4825             :                     }
    4826             :                 }
    4827         713 :                 if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    4828             :                 {
    4829         713 :                     sExtraArg.pfnProgress = GDALScaledProgress;
    4830        1426 :                     sExtraArg.pProgressData = GDALCreateScaledProgress(
    4831         713 :                         nBlocksDone / static_cast<double>(nTotalBlocks),
    4832         713 :                         (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
    4833             :                         pfnProgress, pProgressData);
    4834         713 :                     if (sExtraArg.pProgressData == nullptr)
    4835         276 :                         sExtraArg.pfnProgress = nullptr;
    4836             : 
    4837         713 :                     eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    4838             :                                              nThisLines, pSwathBuf, nThisCols,
    4839             :                                              nThisLines, eDT, nBandCount,
    4840             :                                              nullptr, 0, 0, 0, &sExtraArg);
    4841             : 
    4842         713 :                     GDALDestroyScaledProgress(sExtraArg.pProgressData);
    4843             : 
    4844         713 :                     if (eErr == CE_None)
    4845         713 :                         eErr = poDstDS->RasterIO(
    4846             :                             GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
    4847             :                             nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
    4848             :                             0, 0, nullptr);
    4849             :                 }
    4850             : 
    4851         713 :                 nBlocksDone++;
    4852        1423 :                 if (eErr == CE_None &&
    4853         710 :                     !pfnProgress(nBlocksDone /
    4854         710 :                                      static_cast<double>(nTotalBlocks),
    4855             :                                  nullptr, pProgressData))
    4856             :                 {
    4857           1 :                     eErr = CE_Failure;
    4858           1 :                     CPLError(CE_Failure, CPLE_UserInterrupt,
    4859             :                              "User terminated CreateCopy()");
    4860             :                 }
    4861             :             }
    4862             :         }
    4863             :     }
    4864             : 
    4865             :     /* -------------------------------------------------------------------- */
    4866             :     /*      Cleanup                                                         */
    4867             :     /* -------------------------------------------------------------------- */
    4868        1929 :     CPLFree(pSwathBuf);
    4869             : 
    4870        1929 :     return eErr;
    4871             : }
    4872             : 
    4873             : /************************************************************************/
    4874             : /*                     GDALRasterBandCopyWholeRaster()                  */
    4875             : /************************************************************************/
    4876             : 
    4877             : /**
    4878             :  * \brief Copy a whole raster band
    4879             :  *
    4880             :  * This function copies the complete raster contents of one band to
    4881             :  * another similarly configured band.  The source and destination
    4882             :  * bands must have the same width and height.  The bands do not have
    4883             :  * to have the same data type.
    4884             :  *
    4885             :  * It implements efficient copying, in particular "chunking" the copy in
    4886             :  * substantial blocks.
    4887             :  *
    4888             :  * Currently the only papszOptions value supported are :
    4889             :  * <ul>
    4890             :  * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
    4891             :  * achieve best compression.</li>
    4892             :  * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
    4893             :  * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL &gt;= 2.2)</li>
    4894             :  * </ul>
    4895             :  *
    4896             :  * @param hSrcBand the source band
    4897             :  * @param hDstBand the destination band
    4898             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    4899             :  * @param pfnProgress progress reporting function.
    4900             :  * @param pProgressData callback data for progress function.
    4901             :  *
    4902             :  * @return CE_None on success, or CE_Failure on failure.
    4903             :  */
    4904             : 
    4905          19 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
    4906             :     GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
    4907             :     const char *const *const papszOptions, GDALProgressFunc pfnProgress,
    4908             :     void *pProgressData)
    4909             : 
    4910             : {
    4911          19 :     VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    4912          19 :     VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    4913             : 
    4914          19 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    4915          19 :     GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
    4916          19 :     CPLErr eErr = CE_None;
    4917             : 
    4918          19 :     if (pfnProgress == nullptr)
    4919           2 :         pfnProgress = GDALDummyProgress;
    4920             : 
    4921             :     /* -------------------------------------------------------------------- */
    4922             :     /*      Confirm the datasets match in size and band counts.             */
    4923             :     /* -------------------------------------------------------------------- */
    4924          19 :     int nXSize = poSrcBand->GetXSize();
    4925          19 :     int nYSize = poSrcBand->GetYSize();
    4926             : 
    4927          19 :     if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
    4928             :     {
    4929           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    4930             :                  "Input and output band sizes do not\n"
    4931             :                  "match in GDALRasterBandCopyWholeRaster()");
    4932           0 :         return CE_Failure;
    4933             :     }
    4934             : 
    4935             :     /* -------------------------------------------------------------------- */
    4936             :     /*      Report preliminary (0) progress.                                */
    4937             :     /* -------------------------------------------------------------------- */
    4938          19 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    4939             :     {
    4940           0 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    4941             :                  "User terminated CreateCopy()");
    4942           0 :         return CE_Failure;
    4943             :     }
    4944             : 
    4945          19 :     GDALDataType eDT = poDstBand->GetRasterDataType();
    4946             : 
    4947             :     // If the destination is compressed, we must try to write blocks just once,
    4948             :     // to save disk space (GTiff case for example), and to avoid data loss
    4949             :     // (JPEG compression for example).
    4950          19 :     bool bDstIsCompressed = false;
    4951             :     const char *pszDstCompressed =
    4952          19 :         CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
    4953          19 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    4954          17 :         bDstIsCompressed = true;
    4955             : 
    4956             :     /* -------------------------------------------------------------------- */
    4957             :     /*      What will our swath size be?                                    */
    4958             :     /* -------------------------------------------------------------------- */
    4959             : 
    4960          19 :     int nSwathCols = 0;
    4961          19 :     int nSwathLines = 0;
    4962          19 :     GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
    4963             :                                     FALSE, &nSwathCols, &nSwathLines);
    4964             : 
    4965          19 :     const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    4966             : 
    4967          19 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    4968          19 :     if (pSwathBuf == nullptr)
    4969             :     {
    4970           0 :         return CE_Failure;
    4971             :     }
    4972             : 
    4973          19 :     CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
    4974             :              nSwathCols, nSwathLines);
    4975             : 
    4976             :     const bool bCheckHoles =
    4977          19 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    4978             : 
    4979             :     // Advise the source raster that we are going to read it completely
    4980          19 :     poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
    4981             : 
    4982             :     /* ==================================================================== */
    4983             :     /*      Band oriented (uninterleaved) case.                             */
    4984             :     /* ==================================================================== */
    4985             : 
    4986          47 :     for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    4987             :     {
    4988          28 :         int nThisLines = nSwathLines;
    4989             : 
    4990          28 :         if (iY + nThisLines > nYSize)
    4991           3 :             nThisLines = nYSize - iY;
    4992             : 
    4993          56 :         for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    4994             :         {
    4995          28 :             int nThisCols = nSwathCols;
    4996             : 
    4997          28 :             if (iX + nThisCols > nXSize)
    4998           0 :                 nThisCols = nXSize - iX;
    4999             : 
    5000          28 :             int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    5001          28 :             if (bCheckHoles)
    5002             :             {
    5003           0 :                 nStatus = poSrcBand->GetDataCoverageStatus(
    5004             :                     iX, iY, nThisCols, nThisLines,
    5005             :                     GDAL_DATA_COVERAGE_STATUS_DATA);
    5006             :             }
    5007          28 :             if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    5008             :             {
    5009          28 :                 eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
    5010             :                                            nThisLines, pSwathBuf, nThisCols,
    5011             :                                            nThisLines, eDT, 0, 0, nullptr);
    5012             : 
    5013          28 :                 if (eErr == CE_None)
    5014          28 :                     eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
    5015             :                                                nThisLines, pSwathBuf, nThisCols,
    5016             :                                                nThisLines, eDT, 0, 0, nullptr);
    5017             :             }
    5018             : 
    5019          56 :             if (eErr == CE_None &&
    5020          28 :                 !pfnProgress((iY + nThisLines) / static_cast<float>(nYSize),
    5021             :                              nullptr, pProgressData))
    5022             :             {
    5023           0 :                 eErr = CE_Failure;
    5024           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt,
    5025             :                          "User terminated CreateCopy()");
    5026             :             }
    5027             :         }
    5028             :     }
    5029             : 
    5030             :     /* -------------------------------------------------------------------- */
    5031             :     /*      Cleanup                                                         */
    5032             :     /* -------------------------------------------------------------------- */
    5033          19 :     CPLFree(pSwathBuf);
    5034             : 
    5035          19 :     return eErr;
    5036             : }
    5037             : 
    5038             : /************************************************************************/
    5039             : /*                      GDALCopyRasterIOExtraArg ()                     */
    5040             : /************************************************************************/
    5041             : 
    5042      323808 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
    5043             :                               GDALRasterIOExtraArg *psSrcArg)
    5044             : {
    5045      323808 :     INIT_RASTERIO_EXTRA_ARG(*psDestArg);
    5046      323808 :     if (psSrcArg)
    5047             :     {
    5048      323808 :         psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
    5049      323808 :         psDestArg->pfnProgress = psSrcArg->pfnProgress;
    5050      323808 :         psDestArg->pProgressData = psSrcArg->pProgressData;
    5051      323808 :         psDestArg->bFloatingPointWindowValidity =
    5052      323808 :             psSrcArg->bFloatingPointWindowValidity;
    5053      323808 :         if (psSrcArg->bFloatingPointWindowValidity)
    5054             :         {
    5055        3632 :             psDestArg->dfXOff = psSrcArg->dfXOff;
    5056        3632 :             psDestArg->dfYOff = psSrcArg->dfYOff;
    5057        3632 :             psDestArg->dfXSize = psSrcArg->dfXSize;
    5058        3632 :             psDestArg->dfYSize = psSrcArg->dfYSize;
    5059             :         }
    5060             :     }
    5061      323808 : }
    5062             : 
    5063             : /************************************************************************/
    5064             : /*                         HasOnlyNoData()                              */
    5065             : /************************************************************************/
    5066             : 
    5067     4798012 : template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
    5068             : {
    5069     4798012 :     return value == noDataValue;
    5070             : }
    5071             : 
    5072      560355 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
    5073             : {
    5074      560355 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5075             : }
    5076             : 
    5077      501114 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
    5078             : {
    5079      501114 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5080             : }
    5081             : 
    5082             : template <class T>
    5083        2062 : static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
    5084             :                            size_t nHeight, size_t nLineStride,
    5085             :                            size_t nComponents)
    5086             : {
    5087             :     // Fast test: check the 4 corners and the middle pixel.
    5088        3404 :     for (size_t iBand = 0; iBand < nComponents; iBand++)
    5089             :     {
    5090        4863 :         if (!(IsEqualToNoData(pBuffer[iBand], noDataValue) &&
    5091        2346 :               IsEqualToNoData(pBuffer[(nWidth - 1) * nComponents + iBand],
    5092        2270 :                               noDataValue) &&
    5093        2270 :               IsEqualToNoData(
    5094        2270 :                   pBuffer[((nHeight - 1) / 2 * nLineStride + (nWidth - 1) / 2) *
    5095        2270 :                               nComponents +
    5096             :                           iBand],
    5097        1353 :                   noDataValue) &&
    5098        1353 :               IsEqualToNoData(
    5099        1353 :                   pBuffer[(nHeight - 1) * nLineStride * nComponents + iBand],
    5100             :                   noDataValue) &&
    5101        1345 :               IsEqualToNoData(
    5102        1345 :                   pBuffer[((nHeight - 1) * nLineStride + nWidth - 1) *
    5103        1345 :                               nComponents +
    5104             :                           iBand],
    5105             :                   noDataValue)))
    5106             :         {
    5107        1175 :             return false;
    5108             :         }
    5109             :     }
    5110             : 
    5111             :     // Test all pixels.
    5112       17778 :     for (size_t iY = 0; iY < nHeight; iY++)
    5113             :     {
    5114       16915 :         const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;
    5115     5866544 :         for (size_t iX = 0; iX < nWidth * nComponents; iX++)
    5116             :         {
    5117     5849649 :             if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
    5118             :             {
    5119          24 :                 return false;
    5120             :             }
    5121             :         }
    5122             :     }
    5123         863 :     return true;
    5124             : }
    5125             : 
    5126             : /************************************************************************/
    5127             : /*                    GDALBufferHasOnlyNoData()                         */
    5128             : /************************************************************************/
    5129             : 
    5130       25296 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
    5131             :                              size_t nWidth, size_t nHeight, size_t nLineStride,
    5132             :                              size_t nComponents, int nBitsPerSample,
    5133             :                              GDALBufferSampleFormat nSampleFormat)
    5134             : {
    5135             :     // In the case where the nodata is 0, we can compare several bytes at
    5136             :     // once. Select the largest natural integer type for the architecture.
    5137             : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
    5138             :     // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
    5139             :     typedef std::uint64_t WordType;
    5140             : #else
    5141             :     typedef std::uint32_t WordType;
    5142             : #endif
    5143       25296 :     if (dfNoDataValue == 0.0 && nWidth == nLineStride &&
    5144             :         // Do not use this optimized code path for floating point numbers,
    5145             :         // as it can't detect negative zero.
    5146             :         nSampleFormat != GSF_FLOATING_POINT)
    5147             :     {
    5148       23228 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    5149       23228 :         const size_t nSize =
    5150       23228 :             (nWidth * nHeight * nComponents * nBitsPerSample + 7) / 8;
    5151       23228 :         size_t i = 0;
    5152             :         const size_t nInitialIters =
    5153       46456 :             std::min(sizeof(WordType) -
    5154       23228 :                          static_cast<size_t>(
    5155             :                              reinterpret_cast<std::uintptr_t>(pabyBuffer) %
    5156             :                              sizeof(WordType)),
    5157       23228 :                      nSize);
    5158      194466 :         for (; i < nInitialIters; i++)
    5159             :         {
    5160      174542 :             if (pabyBuffer[i])
    5161        3304 :                 return false;
    5162             :         }
    5163    16038800 :         for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))
    5164             :         {
    5165    16024300 :             if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
    5166        5389 :                 return false;
    5167             :         }
    5168       51890 :         for (; i < nSize; i++)
    5169             :         {
    5170       37360 :             if (pabyBuffer[i])
    5171           5 :                 return false;
    5172             :         }
    5173       14530 :         return true;
    5174             :     }
    5175             : 
    5176        2068 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
    5177             :     {
    5178        2230 :         return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&
    5179        1115 :                HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
    5180        1115 :                               static_cast<uint8_t>(dfNoDataValue), nWidth,
    5181        1115 :                               nHeight, nLineStride, nComponents);
    5182             :     }
    5183         953 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
    5184             :     {
    5185             :         // Use unsigned implementation by converting the nodatavalue to
    5186             :         // unsigned
    5187          63 :         return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
    5188          31 :                HasOnlyNoDataT(
    5189             :                    static_cast<const uint8_t *>(pBuffer),
    5190          31 :                    static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
    5191          32 :                    nWidth, nHeight, nLineStride, nComponents);
    5192             :     }
    5193         921 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
    5194             :     {
    5195          21 :         return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
    5196          10 :                HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
    5197          10 :                               static_cast<uint16_t>(dfNoDataValue), nWidth,
    5198          11 :                               nHeight, nLineStride, nComponents);
    5199             :     }
    5200         910 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
    5201             :     {
    5202             :         // Use unsigned implementation by converting the nodatavalue to
    5203             :         // unsigned
    5204          79 :         return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
    5205          39 :                HasOnlyNoDataT(
    5206             :                    static_cast<const uint16_t *>(pBuffer),
    5207          39 :                    static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
    5208          40 :                    nWidth, nHeight, nLineStride, nComponents);
    5209             :     }
    5210         870 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
    5211             :     {
    5212          73 :         return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
    5213          36 :                HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
    5214             :                               static_cast<uint32_t>(dfNoDataValue), nWidth,
    5215          37 :                               nHeight, nLineStride, nComponents);
    5216             :     }
    5217         833 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
    5218             :     {
    5219             :         // Use unsigned implementation by converting the nodatavalue to
    5220             :         // unsigned
    5221          19 :         return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
    5222           9 :                HasOnlyNoDataT(
    5223             :                    static_cast<const uint32_t *>(pBuffer),
    5224           9 :                    static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
    5225          10 :                    nWidth, nHeight, nLineStride, nComponents);
    5226             :     }
    5227         823 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
    5228             :     {
    5229          56 :         return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
    5230          28 :                HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
    5231             :                               static_cast<uint64_t>(dfNoDataValue), nWidth,
    5232          28 :                               nHeight, nLineStride, nComponents);
    5233             :     }
    5234         795 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
    5235             :     {
    5236             :         // Use unsigned implementation by converting the nodatavalue to
    5237             :         // unsigned
    5238           0 :         return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
    5239           0 :                HasOnlyNoDataT(
    5240             :                    static_cast<const uint64_t *>(pBuffer),
    5241           0 :                    static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
    5242           0 :                    nWidth, nHeight, nLineStride, nComponents);
    5243             :     }
    5244         795 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    5245             :     {
    5246        1323 :         return (std::isnan(dfNoDataValue) ||
    5247        1352 :                 GDALIsValueInRange<float>(dfNoDataValue)) &&
    5248         675 :                HasOnlyNoDataT(static_cast<const float *>(pBuffer),
    5249             :                               static_cast<float>(dfNoDataValue), nWidth,
    5250         676 :                               nHeight, nLineStride, nComponents);
    5251             :     }
    5252         119 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    5253             :     {
    5254         119 :         return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
    5255             :                               dfNoDataValue, nWidth, nHeight, nLineStride,
    5256         119 :                               nComponents);
    5257             :     }
    5258           0 :     return false;
    5259             : }
    5260             : 
    5261             : #if defined(__x86_64) || defined(_M_X64)
    5262             : 
    5263             : #include <emmintrin.h>
    5264             : 
    5265             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    5266             : #include "rasterio_ssse3.h"
    5267             : #endif
    5268             : 
    5269             : /************************************************************************/
    5270             : /*                    GDALDeinterleave3Byte()                           */
    5271             : /************************************************************************/
    5272             : 
    5273             : #if defined(__GNUC__) && !defined(__clang__)
    5274             : __attribute__((optimize("no-tree-vectorize")))
    5275             : #endif
    5276             : static void
    5277       58262 : GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    5278             :                       GByte *CPL_RESTRICT pabyDest0,
    5279             :                       GByte *CPL_RESTRICT pabyDest1,
    5280             :                       GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    5281             : {
    5282             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    5283       58262 :     if (CPLHaveRuntimeSSSE3())
    5284             :     {
    5285       58261 :         return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    5286       58261 :                                            pabyDest2, nIters);
    5287             :     }
    5288             : #endif
    5289             : 
    5290           2 :     size_t i = 0;
    5291           2 :     if (((reinterpret_cast<uintptr_t>(pabySrc) |
    5292           2 :           reinterpret_cast<uintptr_t>(pabyDest0) |
    5293           2 :           reinterpret_cast<uintptr_t>(pabyDest1) |
    5294           2 :           reinterpret_cast<uintptr_t>(pabyDest2)) %
    5295             :          sizeof(unsigned int)) == 0)
    5296             :     {
    5297             :         // Slightly better than GCC autovectorizer
    5298          17 :         for (size_t j = 0; i + 3 < nIters; i += 4, ++j)
    5299             :         {
    5300          15 :             unsigned int word0 =
    5301          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
    5302          15 :             unsigned int word1 =
    5303          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
    5304          15 :             unsigned int word2 =
    5305          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
    5306          15 :             reinterpret_cast<unsigned int *>(pabyDest0)[j] =
    5307          15 :                 (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
    5308          15 :                 ((word2 >> 8) << 24);
    5309          15 :             reinterpret_cast<unsigned int *>(pabyDest1)[j] =
    5310          15 :                 ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
    5311          15 :                 (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
    5312          15 :             pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);
    5313          15 :             pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
    5314          15 :             pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
    5315          15 :             pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
    5316             :         }
    5317             :     }
    5318             : #if defined(__clang__)
    5319             : #pragma clang loop vectorize(disable)
    5320             : #endif
    5321           3 :     for (; i < nIters; ++i)
    5322             :     {
    5323           1 :         pabyDest0[i] = pabySrc[3 * i + 0];
    5324           1 :         pabyDest1[i] = pabySrc[3 * i + 1];
    5325           1 :         pabyDest2[i] = pabySrc[3 * i + 2];
    5326             :     }
    5327             : }
    5328             : 
    5329             : /************************************************************************/
    5330             : /*                    GDALDeinterleave4Byte()                           */
    5331             : /************************************************************************/
    5332             : 
    5333             : #if !defined(__GNUC__) || defined(__clang__)
    5334             : 
    5335             : /************************************************************************/
    5336             : /*                         deinterleave()                               */
    5337             : /************************************************************************/
    5338             : 
    5339             : template <bool SHIFT, bool MASK>
    5340             : inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
    5341             :                             __m128i &xmm2_ori, __m128i &xmm3_ori)
    5342             : {
    5343             :     // Set higher 24bit of each int32 packed word to 0
    5344             :     if (SHIFT)
    5345             :     {
    5346             :         xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
    5347             :         xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
    5348             :         xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
    5349             :         xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
    5350             :     }
    5351             :     __m128i xmm0;
    5352             :     __m128i xmm1;
    5353             :     __m128i xmm2;
    5354             :     __m128i xmm3;
    5355             :     if (MASK)
    5356             :     {
    5357             :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    5358             :         xmm0 = _mm_and_si128(xmm0_ori, xmm_mask);
    5359             :         xmm1 = _mm_and_si128(xmm1_ori, xmm_mask);
    5360             :         xmm2 = _mm_and_si128(xmm2_ori, xmm_mask);
    5361             :         xmm3 = _mm_and_si128(xmm3_ori, xmm_mask);
    5362             :     }
    5363             :     else
    5364             :     {
    5365             :         xmm0 = xmm0_ori;
    5366             :         xmm1 = xmm1_ori;
    5367             :         xmm2 = xmm2_ori;
    5368             :         xmm3 = xmm3_ori;
    5369             :     }
    5370             :     // Pack int32 to int16
    5371             :     xmm0 = _mm_packs_epi32(xmm0, xmm1);
    5372             :     xmm2 = _mm_packs_epi32(xmm2, xmm3);
    5373             :     // Pack int16 to uint8
    5374             :     xmm0 = _mm_packus_epi16(xmm0, xmm2);
    5375             :     return xmm0;
    5376             : }
    5377             : 
    5378             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    5379             :                                   GByte *CPL_RESTRICT pabyDest0,
    5380             :                                   GByte *CPL_RESTRICT pabyDest1,
    5381             :                                   GByte *CPL_RESTRICT pabyDest2,
    5382             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    5383             : {
    5384             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    5385             :     if (CPLHaveRuntimeSSSE3())
    5386             :     {
    5387             :         return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    5388             :                                            pabyDest2, pabyDest3, nIters);
    5389             :     }
    5390             : #endif
    5391             : 
    5392             :     // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
    5393             :     // do something slightly better.
    5394             :     size_t i = 0;
    5395             :     for (; i + 15 < nIters; i += 16)
    5396             :     {
    5397             :         __m128i xmm0_ori = _mm_loadu_si128(
    5398             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
    5399             :         __m128i xmm1_ori = _mm_loadu_si128(
    5400             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
    5401             :         __m128i xmm2_ori = _mm_loadu_si128(
    5402             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
    5403             :         __m128i xmm3_ori = _mm_loadu_si128(
    5404             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
    5405             : 
    5406             :         _mm_storeu_si128(
    5407             :             reinterpret_cast<__m128i *>(pabyDest0 + i),
    5408             :             deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5409             :         _mm_storeu_si128(
    5410             :             reinterpret_cast<__m128i *>(pabyDest1 + i),
    5411             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5412             :         _mm_storeu_si128(
    5413             :             reinterpret_cast<__m128i *>(pabyDest2 + i),
    5414             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5415             :         _mm_storeu_si128(
    5416             :             reinterpret_cast<__m128i *>(pabyDest3 + i),
    5417             :             deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5418             :     }
    5419             : 
    5420             : #if defined(__clang__)
    5421             : #pragma clang loop vectorize(disable)
    5422             : #endif
    5423             :     for (; i < nIters; ++i)
    5424             :     {
    5425             :         pabyDest0[i] = pabySrc[4 * i + 0];
    5426             :         pabyDest1[i] = pabySrc[4 * i + 1];
    5427             :         pabyDest2[i] = pabySrc[4 * i + 2];
    5428             :         pabyDest3[i] = pabySrc[4 * i + 3];
    5429             :     }
    5430             : }
    5431             : #else
    5432             : // GCC autovectorizer does an excellent job
    5433       50228 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
    5434             :     const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    5435             :     GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
    5436             :     GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    5437             : {
    5438   522918000 :     for (size_t i = 0; i < nIters; ++i)
    5439             :     {
    5440   522867000 :         pabyDest0[i] = pabySrc[4 * i + 0];
    5441   522867000 :         pabyDest1[i] = pabySrc[4 * i + 1];
    5442   522867000 :         pabyDest2[i] = pabySrc[4 * i + 2];
    5443   522867000 :         pabyDest3[i] = pabySrc[4 * i + 3];
    5444             :     }
    5445       50228 : }
    5446             : #endif
    5447             : 
    5448             : #else
    5449             : 
    5450             : /************************************************************************/
    5451             : /*                    GDALDeinterleave3Byte()                           */
    5452             : /************************************************************************/
    5453             : 
    5454             : // TODO: Enabling below could help on non-Intel architectures where GCC knows
    5455             : // how to auto-vectorize
    5456             : // #if defined(__GNUC__)
    5457             : //__attribute__((optimize("tree-vectorize")))
    5458             : // #endif
    5459             : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    5460             :                                   GByte *CPL_RESTRICT pabyDest0,
    5461             :                                   GByte *CPL_RESTRICT pabyDest1,
    5462             :                                   GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    5463             : {
    5464             :     for (size_t i = 0; i < nIters; ++i)
    5465             :     {
    5466             :         pabyDest0[i] = pabySrc[3 * i + 0];
    5467             :         pabyDest1[i] = pabySrc[3 * i + 1];
    5468             :         pabyDest2[i] = pabySrc[3 * i + 2];
    5469             :     }
    5470             : }
    5471             : 
    5472             : /************************************************************************/
    5473             : /*                    GDALDeinterleave4Byte()                           */
    5474             : /************************************************************************/
    5475             : 
    5476             : // TODO: Enabling below could help on non-Intel architectures where gcc knows
    5477             : // how to auto-vectorize
    5478             : // #if defined(__GNUC__)
    5479             : //__attribute__((optimize("tree-vectorize")))
    5480             : // #endif
    5481             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    5482             :                                   GByte *CPL_RESTRICT pabyDest0,
    5483             :                                   GByte *CPL_RESTRICT pabyDest1,
    5484             :                                   GByte *CPL_RESTRICT pabyDest2,
    5485             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    5486             : {
    5487             :     for (size_t i = 0; i < nIters; ++i)
    5488             :     {
    5489             :         pabyDest0[i] = pabySrc[4 * i + 0];
    5490             :         pabyDest1[i] = pabySrc[4 * i + 1];
    5491             :         pabyDest2[i] = pabySrc[4 * i + 2];
    5492             :         pabyDest3[i] = pabySrc[4 * i + 3];
    5493             :     }
    5494             : }
    5495             : 
    5496             : #endif
    5497             : 
    5498             : /************************************************************************/
    5499             : /*                      GDALDeinterleave()                              */
    5500             : /************************************************************************/
    5501             : 
    5502             : /*! Copy values from a pixel-interleave buffer to multiple per-component
    5503             :     buffers.
    5504             : 
    5505             :     In pseudo-code
    5506             :     \verbatim
    5507             :     for(size_t i = 0; i < nIters; ++i)
    5508             :         for(int iComp = 0; iComp < nComponents; iComp++ )
    5509             :             ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
    5510             :     \endverbatim
    5511             : 
    5512             :     The implementation is optimized for a few cases, like de-interleaving
    5513             :     of 3 or 4-components Byte buffers.
    5514             : 
    5515             :     \since GDAL 3.6
    5516             :  */
    5517      109244 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
    5518             :                       int nComponents, void **ppDestBuffer,
    5519             :                       GDALDataType eDestDT, size_t nIters)
    5520             : {
    5521      109244 :     if (eSourceDT == eDestDT)
    5522             :     {
    5523      109222 :         if (eSourceDT == GDT_Byte)
    5524             :         {
    5525      108491 :             if (nComponents == 3)
    5526             :             {
    5527       58263 :                 const GByte *CPL_RESTRICT pabySrc =
    5528             :                     static_cast<const GByte *>(pSourceBuffer);
    5529       58263 :                 GByte *CPL_RESTRICT pabyDest0 =
    5530             :                     static_cast<GByte *>(ppDestBuffer[0]);
    5531       58263 :                 GByte *CPL_RESTRICT pabyDest1 =
    5532             :                     static_cast<GByte *>(ppDestBuffer[1]);
    5533       58263 :                 GByte *CPL_RESTRICT pabyDest2 =
    5534             :                     static_cast<GByte *>(ppDestBuffer[2]);
    5535       58263 :                 GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5536             :                                       nIters);
    5537       58263 :                 return;
    5538             :             }
    5539       50228 :             else if (nComponents == 4)
    5540             :             {
    5541       50228 :                 const GByte *CPL_RESTRICT pabySrc =
    5542             :                     static_cast<const GByte *>(pSourceBuffer);
    5543       50228 :                 GByte *CPL_RESTRICT pabyDest0 =
    5544             :                     static_cast<GByte *>(ppDestBuffer[0]);
    5545       50228 :                 GByte *CPL_RESTRICT pabyDest1 =
    5546             :                     static_cast<GByte *>(ppDestBuffer[1]);
    5547       50228 :                 GByte *CPL_RESTRICT pabyDest2 =
    5548             :                     static_cast<GByte *>(ppDestBuffer[2]);
    5549       50228 :                 GByte *CPL_RESTRICT pabyDest3 =
    5550             :                     static_cast<GByte *>(ppDestBuffer[3]);
    5551       50228 :                 GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5552             :                                       pabyDest3, nIters);
    5553       50229 :                 return;
    5554             :             }
    5555             :         }
    5556             : #if ((defined(__GNUC__) && !defined(__clang__)) ||                             \
    5557             :      defined(__INTEL_CLANG_COMPILER)) &&                                       \
    5558             :     (defined(__x86_64) || defined(_M_X64)) &&                                  \
    5559             :     defined(HAVE_SSSE3_AT_COMPILE_TIME)
    5560        1462 :         else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
    5561         731 :                  CPLHaveRuntimeSSSE3())
    5562             :         {
    5563         731 :             if (nComponents == 3)
    5564             :             {
    5565         237 :                 const GUInt16 *CPL_RESTRICT panSrc =
    5566             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    5567         237 :                 GUInt16 *CPL_RESTRICT panDest0 =
    5568             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    5569         237 :                 GUInt16 *CPL_RESTRICT panDest1 =
    5570             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    5571         237 :                 GUInt16 *CPL_RESTRICT panDest2 =
    5572             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    5573         237 :                 GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
    5574             :                                               panDest2, nIters);
    5575         237 :                 return;
    5576             :             }
    5577             : #if !defined(__INTEL_CLANG_COMPILER)
    5578             :             // ICC autovectorizer doesn't do a good job, at least with icx
    5579             :             // 2022.1.0.20220316
    5580         494 :             else if (nComponents == 4)
    5581             :             {
    5582         494 :                 const GUInt16 *CPL_RESTRICT panSrc =
    5583             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    5584         494 :                 GUInt16 *CPL_RESTRICT panDest0 =
    5585             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    5586         494 :                 GUInt16 *CPL_RESTRICT panDest1 =
    5587             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    5588         494 :                 GUInt16 *CPL_RESTRICT panDest2 =
    5589             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    5590         494 :                 GUInt16 *CPL_RESTRICT panDest3 =
    5591             :                     static_cast<GUInt16 *>(ppDestBuffer[3]);
    5592         494 :                 GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
    5593             :                                               panDest2, panDest3, nIters);
    5594         494 :                 return;
    5595             :             }
    5596             : #endif
    5597             :         }
    5598             : #endif
    5599             :     }
    5600             : 
    5601          22 :     const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
    5602          22 :     const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
    5603          87 :     for (int iComp = 0; iComp < nComponents; iComp++)
    5604             :     {
    5605          65 :         GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
    5606          65 :                             iComp * nSourceDTSize,
    5607             :                         eSourceDT, nComponents * nSourceDTSize,
    5608          65 :                         ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
    5609             :     }
    5610             : }

Generated by: LCOV version 1.14