LCOV - code coverage report
Current view: top level - gcore - rasterio.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2386 2603 91.7 %
Date: 2025-01-18 12:42:00 Functions: 555 589 94.2 %

          Line data    Source code
       1             : /******************************************************************************
       2             :  *
       3             :  * Project:  GDAL Core
       4             :  * Purpose:  Contains default implementation of GDALRasterBand::IRasterIO()
       5             :  *           and supporting functions of broader utility.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 1998, Frank Warmerdam
      10             :  * Copyright (c) 2007-2014, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal.h"
      17             : #include "gdal_priv.h"
      18             : 
      19             : #include <cassert>
      20             : #include <climits>
      21             : #include <cmath>
      22             : #include <cstddef>
      23             : #include <cstdio>
      24             : #include <cstdlib>
      25             : #include <cstring>
      26             : 
      27             : #include <algorithm>
      28             : #include <limits>
      29             : #include <stdexcept>
      30             : #include <type_traits>
      31             : 
      32             : #include "cpl_conv.h"
      33             : #include "cpl_cpu_features.h"
      34             : #include "cpl_error.h"
      35             : #include "cpl_progress.h"
      36             : #include "cpl_string.h"
      37             : #include "cpl_vsi.h"
      38             : #include "gdal_priv_templates.hpp"
      39             : #include "gdal_vrt.h"
      40             : #include "gdalwarper.h"
      41             : #include "memdataset.h"
      42             : #include "vrtdataset.h"
      43             : 
      44             : #if defined(__x86_64) || defined(_M_X64)
      45             : #include <emmintrin.h>
      46             : #define HAVE_SSE2
      47             : #elif defined(USE_NEON_OPTIMIZATIONS)
      48             : #include "include_sse2neon.h"
      49             : #define HAVE_SSE2
      50             : #endif
      51             : 
      52             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
      53             : #include "rasterio_ssse3.h"
      54             : #ifdef __SSSE3__
      55             : #include <tmmintrin.h>
      56             : #endif
      57             : #endif
      58             : 
      59             : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
      60             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
      61             :                              int nDstPixelStride, GPtrDiff_t nWordCount);
      62             : 
      63             : /************************************************************************/
      64             : /*                    DownsamplingIntegerXFactor()                      */
      65             : /************************************************************************/
      66             : 
                       // Copies nBufXSize pixels into pabyDstData, taking one source pixel every
                       // nSrcXInc columns starting at column iSrcX, and fetching the band blocks
                       // that hold them through poBand->GetLockedBlockRef() as needed.
                       //
                       // Template parameters:
                       //   bSameDataType   when true, pixels are copied verbatim (memcpy, or
                       //                   GDALFastCopyByte() for 1-byte pixels) and the pixel
                       //                   size is DATA_TYPE_SIZE; when false, GDALCopyWords64()
                       //                   converts from eDataType to eBufType and the source
                       //                   pixel size is GDALGetDataTypeSizeBytes(eDataType).
                       //   DATA_TYPE_SIZE  pixel size in bytes; only used when bSameDataType is
                       //                   true.
                       //
                       // Parameters:
                       //   poBand         band the blocks are read from; its GetXSize() also
                       //                  bounds the source column used for the last pixel.
                       //   iSrcX          source column of the first pixel.
                       //   nSrcXInc       source column step between two output pixels.
                       //   iSrcOffsetCst  pixel offset added to (iSrcX - nStartBlockX) to locate
                       //                  the sample inside the block data (presumably the
                       //                  offset of the source line within the block -- to be
                       //                  confirmed against the callers).
                       //   pabyDstData    where the first output pixel is written.
                       //   nPixelSpace    byte distance between consecutive output pixels.
                       //   nBufXSize      number of output pixels (assumed >= 1; not checked
                       //                  here).
                       //   eDataType      data type of the block data (conversion path only).
                       //   eBufType       data type of the output (conversion path only).
                       //   nStartBlockX   [in,out] first column of the block referenced by
                       //                  poBlock; rewritten each time another block is loaded.
                       //   nBlockXSize    block width in pixels.
                       //   poBlock        [in,out] current block. It must be non-null on entry
                       //                  when iSrcX lies before nStartBlockX + nBlockXSize
                       //                  (checked with CPLAssert). When another block is
                       //                  needed, DropLock() is called on the previous one
                       //                  first. The block referenced on return is not released
                       //                  here.
                       //   nLBlockY       block row index passed to GetLockedBlockRef().
                       //
                       // Returns true on success, or false as soon as GetLockedBlockRef() returns
                       // nullptr (poBlock is then nullptr).
                       //
                       // Control flow note: the while loop below is entered through its body (via
                       // the reload_block / no_reload_block labels) both for the first pixel and,
                       // once the loop has ended, for the last pixel, whose source column is
                       // clamped to the last column of the raster.
      67             : template <bool bSameDataType, int DATA_TYPE_SIZE>
      68      413236 : static bool DownsamplingIntegerXFactor(
      69             :     GDALRasterBand *poBand, int iSrcX, int nSrcXInc, GPtrDiff_t iSrcOffsetCst,
      70             :     GByte *CPL_RESTRICT pabyDstData, int nPixelSpace, int nBufXSize,
      71             :     GDALDataType eDataType, GDALDataType eBufType, int &nStartBlockX,
      72             :     int nBlockXSize, GDALRasterBlock *&poBlock, int nLBlockY)
      73             : {
      74      413236 :     const int nBandDataSize =
      75             :         bSameDataType ? DATA_TYPE_SIZE : GDALGetDataTypeSizeBytes(eDataType);
                           // Loop budget: the first pixel is handled by jumping into the loop
                           // body below, and the last one by the tail code after the loop.
      76      413236 :     int nOuterLoopIters = nBufXSize - 1;
                           // Byte distance between two consecutive source samples.
      77      413236 :     const int nIncSrcOffset = nSrcXInc * nBandDataSize;
                           // Always assigned at no_reload_block before it is read.
      78             :     const GByte *CPL_RESTRICT pabySrcData;
      79      413236 :     int nEndBlockX = nBlockXSize + nStartBlockX;
      80             : 
                           // First pixel: jump straight into the loop body, skipping the block
                           // fetch when iSrcX lies before the end of the caller-provided block.
      81      413236 :     if (iSrcX < nEndBlockX)
      82             :     {
      83      226134 :         CPLAssert(poBlock);
      84      226134 :         goto no_reload_block;
      85             :     }
      86      187102 :     goto reload_block;
      87             : 
      88             :     // Don't do the last iteration in the loop, as iSrcX might go beyond
      89             :     // nRasterXSize - 1
      90      932852 :     while (--nOuterLoopIters >= 1)
      91             :     {
                               // Step to the next output pixel and its source sample.
      92      189034 :         iSrcX += nSrcXInc;
      93      189034 :         pabySrcData += nIncSrcOffset;
      94      189034 :         pabyDstData += nPixelSpace;
      95             : 
      96             :         /* --------------------------------------------------------------------
      97             :          */
      98             :         /*      Ensure we have the appropriate block loaded. */
      99             :         /* --------------------------------------------------------------------
     100             :          */
     101      189034 :         if (iSrcX >= nEndBlockX)
     102             :         {
     103      189034 :         reload_block:
     104             :         {
                                   // Block column containing iSrcX, and the column range it spans.
     105      388726 :             const int nLBlockX = iSrcX / nBlockXSize;
     106      388726 :             nStartBlockX = nLBlockX * nBlockXSize;
     107      388726 :             nEndBlockX = nStartBlockX + nBlockXSize;
     108             : 
                                   // Release the previous block before requesting the next one.
     109      388726 :             if (poBlock != nullptr)
     110      316739 :                 poBlock->DropLock();
     111             : 
     112      388726 :             poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     113      388726 :             if (poBlock == nullptr)
     114             :             {
     115           1 :                 return false;
     116             :             }
     117             :         }
     118             : 
                               // Locate the source sample for iSrcX inside the current block's
                               // data. Also reached directly (by goto) when the block already
                               // referenced by poBlock can be reused.
     119      388725 :         no_reload_block:
     120             :             const GByte *pabySrcBlock =
     121      932852 :                 static_cast<const GByte *>(poBlock->GetDataRef());
     122      932852 :             GPtrDiff_t iSrcOffset =
     123      932852 :                 (iSrcX - nStartBlockX + iSrcOffsetCst) * nBandDataSize;
     124      932852 :             pabySrcData = pabySrcBlock + iSrcOffset;
     125             :         }
     126             : 
     127             :         /* --------------------------------------------------------------------
     128             :          */
     129             :         /*      Copy the maximum run of pixels. */
     130             :         /* --------------------------------------------------------------------
     131             :          */
     132             : 
                               // Number of samples between iSrcX and the end of the current block
                               // (rounded up), capped by nOuterLoopIters so that the last output
                               // pixel is always left to the tail code after the loop. It is 0
                               // when nOuterLoopIters is 0, in which case exactly one pixel is
                               // copied below.
     133      932852 :         const int nIters = std::min(
     134      932852 :             (nEndBlockX - iSrcX + (nSrcXInc - 1)) / nSrcXInc, nOuterLoopIters);
     135             :         if (bSameDataType)
     136             :         {
                                   // Copy the first pixel of the run; the remaining nIters - 1
                                   // pixels, if any, follow.
     137      932447 :             memcpy(pabyDstData, pabySrcData, nBandDataSize);
     138      932447 :             if (nIters > 1)
     139             :             {
     140             :                 if (DATA_TYPE_SIZE == 1)
     141             :                 {
                                           // Byte pixels: copy the rest of the run in one strided
                                           // call, then leave both pointers on the last pixel
                                           // that was copied.
     142      276287 :                     pabySrcData += nIncSrcOffset;
     143      276287 :                     pabyDstData += nPixelSpace;
     144      276287 :                     GDALFastCopyByte(pabySrcData, nIncSrcOffset, pabyDstData,
     145      276287 :                                      nPixelSpace, nIters - 1);
     146      276287 :                     pabySrcData +=
     147      276287 :                         static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 2);
     148      276287 :                     pabyDstData +=
     149      276287 :                         static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 2);
     150             :                 }
     151             :                 else
     152             :                 {
                                           // Wider pixels: copy the rest of the run one pixel at a
                                           // time; the pointers end on the last pixel copied.
     153     4443828 :                     for (int i = 0; i < nIters - 1; i++)
     154             :                     {
     155     4245254 :                         pabySrcData += nIncSrcOffset;
     156     4245254 :                         pabyDstData += nPixelSpace;
     157     4245254 :                         memcpy(pabyDstData, pabySrcData, nBandDataSize);
     158             :                     }
     159             :                 }
                                       // Account for the pixels of the run beyond the first one.
     160      474861 :                 iSrcX += nSrcXInc * (nIters - 1);
     161      474861 :                 nOuterLoopIters -= nIters - 1;
     162             :             }
     163             :         }
     164             :         else
     165             :         {
     166             :             // Type to type conversion ...
     167         405 :             GDALCopyWords64(pabySrcData, eDataType, nIncSrcOffset, pabyDstData,
     168         405 :                             eBufType, nPixelSpace, std::max(1, nIters));
                                   // As in the same-type branch, leave the pointers and counters
                                   // on the last pixel of the run.
     169         405 :             if (nIters > 1)
     170             :             {
     171         198 :                 pabySrcData +=
     172         198 :                     static_cast<GPtrDiff_t>(nIncSrcOffset) * (nIters - 1);
     173         198 :                 pabyDstData +=
     174         198 :                     static_cast<GPtrDiff_t>(nPixelSpace) * (nIters - 1);
     175         198 :                 iSrcX += nSrcXInc * (nIters - 1);
     176         198 :                 nOuterLoopIters -= nIters - 1;
     177             :             }
     178             :         }
     179             :     }
     180             : 
     181             :     // Deal with last iteration to avoid iSrcX to go beyond nRasterXSize - 1
                           // This branch is taken at most once: after the last pixel has been
                           // copied, the loop test decrements nOuterLoopIters below 0. When
                           // nBufXSize is 1 the counter is already negative here and the tail is
                           // skipped.
     182      743818 :     if (nOuterLoopIters == 0)
     183             :     {
     184      330583 :         const int nRasterXSize = poBand->GetXSize();
                               // The sum is done in 64 bits before clamping to the last column.
     185      330583 :         iSrcX =
     186      661166 :             static_cast<int>(std::min(static_cast<GInt64>(iSrcX) + nSrcXInc,
     187      330583 :                                       static_cast<GInt64>(nRasterXSize - 1)));
     188      330583 :         pabyDstData += nPixelSpace;
                               // Re-enter the loop body to copy that single pixel.
     189      330583 :         if (iSrcX < nEndBlockX)
     190             :         {
     191      317993 :             goto no_reload_block;
     192             :         }
     193       12590 :         goto reload_block;
     194             :     }
     195      413235 :     return true;
     196             : }
     197             : 
     198             : /************************************************************************/
     199             : /*                             IRasterIO()                              */
     200             : /*                                                                      */
     201             : /*      Default internal implementation of RasterIO() ... utilizes      */
     202             : /*      the Block access methods to satisfy the request.  This would    */
     203             : /*      normally only be overridden by formats with overviews.          */
     204             : /************************************************************************/
     205             : 
     206     5706710 : CPLErr GDALRasterBand::IRasterIO(GDALRWFlag eRWFlag, int nXOff, int nYOff,
     207             :                                  int nXSize, int nYSize, void *pData,
     208             :                                  int nBufXSize, int nBufYSize,
     209             :                                  GDALDataType eBufType, GSpacing nPixelSpace,
     210             :                                  GSpacing nLineSpace,
     211             :                                  GDALRasterIOExtraArg *psExtraArg)
     212             : 
     213             : {
     214     5706710 :     if (eRWFlag == GF_Write && eFlushBlockErr != CE_None)
     215             :     {
     216           0 :         CPLError(eFlushBlockErr, CPLE_AppDefined,
     217             :                  "An error occurred while writing a dirty block "
     218             :                  "from GDALRasterBand::IRasterIO");
     219           0 :         CPLErr eErr = eFlushBlockErr;
     220           0 :         eFlushBlockErr = CE_None;
     221           0 :         return eErr;
     222             :     }
     223     5706710 :     if (nBlockXSize <= 0 || nBlockYSize <= 0)
     224             :     {
     225          86 :         CPLError(CE_Failure, CPLE_AppDefined, "Invalid block size");
     226           0 :         return CE_Failure;
     227             :     }
     228             : 
     229     5706620 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
     230     5706580 :     const int nBufDataSize = GDALGetDataTypeSizeBytes(eBufType);
     231     5706550 :     GByte dummyBlock[2] = {0, 0};
     232     5706550 :     GByte *pabySrcBlock =
     233             :         dummyBlock; /* to avoid Coverity warning about nullptr dereference */
     234     5706550 :     GDALRasterBlock *poBlock = nullptr;
     235     5706550 :     const bool bUseIntegerRequestCoords =
     236     5745890 :         (!psExtraArg->bFloatingPointWindowValidity ||
     237       39339 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
     238       15979 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
     239             : 
     240             :     /* ==================================================================== */
     241             :     /*      A common case is the data requested with the destination        */
     242             :     /*      is packed, and the block width is the raster width.             */
     243             :     /* ==================================================================== */
     244     5624630 :     if (nPixelSpace == nBufDataSize && nLineSpace == nPixelSpace * nXSize &&
     245     2945030 :         nBlockXSize == GetXSize() && nBufXSize == nXSize &&
     246    11331200 :         nBufYSize == nYSize && bUseIntegerRequestCoords)
     247             :     {
     248     2812690 :         CPLErr eErr = CE_None;
     249     2812690 :         int nLBlockY = -1;
     250             : 
     251     8202380 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     252             :         {
     253     5390360 :             const int iSrcY = iBufYOff + nYOff;
     254             : 
     255     5390360 :             if (iSrcY < nLBlockY * nBlockYSize ||
     256     5390410 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
     257             :             {
     258     3054820 :                 nLBlockY = iSrcY / nBlockYSize;
     259     3054820 :                 bool bJustInitialize =
     260       97703 :                     eRWFlag == GF_Write && nXOff == 0 &&
     261     3203980 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     262       51455 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize;
     263             : 
     264             :                 // Is this a partial tile at right and/or bottom edges of
     265             :                 // the raster, and that is going to be completely written?
     266             :                 // If so, do not load it from storage, but zero it so that
     267             :                 // the content outsize of the validity area is initialized.
     268     3054820 :                 bool bMemZeroBuffer = false;
     269       97703 :                 if (eRWFlag == GF_Write && !bJustInitialize && nXOff == 0 &&
     270       21961 :                     nXSize == nBlockXSize && nYOff <= nLBlockY * nBlockYSize &&
     271     3152610 :                     nYOff + nYSize == GetYSize() &&
     272          89 :                     nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)
     273             :                 {
     274          89 :                     bJustInitialize = true;
     275          89 :                     bMemZeroBuffer = true;
     276             :                 }
     277             : 
     278     3054820 :                 if (poBlock)
     279      242083 :                     poBlock->DropLock();
     280             : 
     281     3054820 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     282     3054780 :                 poBlock = GetLockedBlockRef(0, nLBlockY, bJustInitialize);
     283     3054980 :                 if (poBlock == nullptr)
     284             :                 {
     285        1067 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     286             :                         nullptr)
     287             :                     {
     288           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     289             :                                  "GetBlockRef failed at X block offset %d, "
     290             :                                  "Y block offset %d%s",
     291             :                                  0, nLBlockY,
     292           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     293           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     294             :                                      : "");
     295             :                     }
     296        1067 :                     eErr = CE_Failure;
     297        1067 :                     break;
     298             :                 }
     299             : 
     300     3053910 :                 if (eRWFlag == GF_Write)
     301       97703 :                     poBlock->MarkDirty();
     302             : 
     303     3053910 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     304     3053910 :                 if (bMemZeroBuffer)
     305             :                 {
     306          89 :                     memset(pabySrcBlock, 0,
     307          89 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     308          89 :                                nBlockXSize * nBlockYSize);
     309             :                 }
     310             :             }
     311             : 
     312     5389450 :             const auto nSrcByteOffset =
     313     5389450 :                 (static_cast<GPtrDiff_t>(iSrcY - nLBlockY * nBlockYSize) *
     314     5389450 :                      nBlockXSize +
     315     5389450 :                  nXOff) *
     316     5389450 :                 nBandDataSize;
     317             : 
     318     5389450 :             if (eDataType == eBufType)
     319             :             {
     320     1740010 :                 if (eRWFlag == GF_Read)
     321     1498150 :                     memcpy(static_cast<GByte *>(pData) +
     322     1498150 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     323     1498150 :                            pabySrcBlock + nSrcByteOffset,
     324             :                            static_cast<size_t>(nLineSpace));
     325             :                 else
     326      241865 :                     memcpy(pabySrcBlock + nSrcByteOffset,
     327      241865 :                            static_cast<GByte *>(pData) +
     328      241865 :                                static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     329             :                            static_cast<size_t>(nLineSpace));
     330             :             }
     331             :             else
     332             :             {
     333             :                 // Type to type conversion.
     334             : 
     335     3649430 :                 if (eRWFlag == GF_Read)
     336     3628880 :                     GDALCopyWords64(
     337     3628880 :                         pabySrcBlock + nSrcByteOffset, eDataType, nBandDataSize,
     338             :                         static_cast<GByte *>(pData) +
     339     3628880 :                             static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace,
     340             :                         eBufType, static_cast<int>(nPixelSpace), nBufXSize);
     341             :                 else
     342       20557 :                     GDALCopyWords64(static_cast<GByte *>(pData) +
     343       20557 :                                         static_cast<GPtrDiff_t>(iBufYOff) *
     344             :                                             nLineSpace,
     345             :                                     eBufType, static_cast<int>(nPixelSpace),
     346       20557 :                                     pabySrcBlock + nSrcByteOffset, eDataType,
     347             :                                     nBandDataSize, nBufXSize);
     348             :             }
     349             : 
     350     5449590 :             if (psExtraArg->pfnProgress != nullptr &&
     351       59892 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     352             :                                          psExtraArg->pProgressData))
     353             :             {
     354           5 :                 eErr = CE_Failure;
     355           5 :                 break;
     356             :             }
     357             :         }
     358             : 
     359     2813100 :         if (poBlock)
     360     2811780 :             poBlock->DropLock();
     361             : 
     362     2812850 :         return eErr;
     363             :     }
     364             : 
     365             :     /* ==================================================================== */
     366             :     /*      Do we have overviews that would be appropriate to satisfy       */
     367             :     /*      this request?                                                   */
     368             :     /* ==================================================================== */
     369     2893890 :     if ((nBufXSize < nXSize || nBufYSize < nYSize) && GetOverviewCount() > 0 &&
     370             :         eRWFlag == GF_Read)
     371             :     {
     372             :         GDALRasterIOExtraArg sExtraArg;
     373        2832 :         GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
     374             : 
     375             :         const int nOverview =
     376        2832 :             GDALBandGetBestOverviewLevel2(this, nXOff, nYOff, nXSize, nYSize,
     377             :                                           nBufXSize, nBufYSize, &sExtraArg);
     378        2832 :         if (nOverview >= 0)
     379             :         {
     380        2812 :             GDALRasterBand *poOverviewBand = GetOverview(nOverview);
     381        2812 :             if (poOverviewBand == nullptr)
     382        2812 :                 return CE_Failure;
     383             : 
     384        2812 :             return poOverviewBand->RasterIO(
     385             :                 eRWFlag, nXOff, nYOff, nXSize, nYSize, pData, nBufXSize,
     386        2812 :                 nBufYSize, eBufType, nPixelSpace, nLineSpace, &sExtraArg);
     387             :         }
     388             :     }
     389             : 
     390      702432 :     if (eRWFlag == GF_Read && nBufXSize < nXSize / 100 &&
     391           0 :         nBufYSize < nYSize / 100 && nPixelSpace == nBufDataSize &&
     392     3593500 :         nLineSpace == nPixelSpace * nBufXSize &&
     393           0 :         CPLTestBool(CPLGetConfigOption("GDAL_NO_COSTLY_OVERVIEW", "NO")))
     394             :     {
     395           0 :         memset(pData, 0, static_cast<size_t>(nLineSpace * nBufYSize));
     396           0 :         return CE_None;
     397             :     }
     398             : 
     399             :     /* ==================================================================== */
     400             :     /*      The second case when we don't need subsample data but likely    */
     401             :     /*      need data type conversion.                                      */
     402             :     /* ==================================================================== */
     403     2891070 :     if (  // nPixelSpace == nBufDataSize &&
     404     2891070 :         nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
     405             :     {
     406             : #if DEBUG_VERBOSE
     407             :         printf("IRasterIO(%d,%d,%d,%d) rw=%d case 2\n", /*ok*/
     408             :                nXOff, nYOff, nXSize, nYSize, static_cast<int>(eRWFlag));
     409             : #endif
     410             : 
     411             :         /* --------------------------------------------------------------------
     412             :          */
     413             :         /*      Loop over buffer computing source locations. */
     414             :         /* --------------------------------------------------------------------
     415             :          */
     416             :         // Calculate starting values out of loop
     417     2528130 :         const int nLBlockXStart = nXOff / nBlockXSize;
     418     2528130 :         const int nXSpanEnd = nBufXSize + nXOff;
     419             : 
     420     2528130 :         int nYInc = 0;
     421     5090840 :         for (int iBufYOff = 0, iSrcY = nYOff; iBufYOff < nBufYSize;
     422     2562710 :              iBufYOff += nYInc, iSrcY += nYInc)
     423             :         {
     424     2562600 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     425             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     426     2562600 :             int nLBlockY = iSrcY / nBlockYSize;
     427     2562600 :             int nLBlockX = nLBlockXStart;
     428     2562600 :             int iSrcX = nXOff;
     429     5336860 :             while (iSrcX < nXSpanEnd)
     430             :             {
     431     2774040 :                 int nXSpan = nLBlockX * nBlockXSize;
     432     2774040 :                 if (nXSpan < INT_MAX - nBlockXSize)
     433     2773940 :                     nXSpan += nBlockXSize;
     434             :                 else
     435         101 :                     nXSpan = INT_MAX;
     436     2774040 :                 const int nXRight = nXSpan;
     437     2774040 :                 nXSpan = (nXSpan < nXSpanEnd ? nXSpan : nXSpanEnd) - iSrcX;
     438     2774040 :                 const size_t nXSpanSize =
     439     2774040 :                     nXSpan * static_cast<size_t>(nPixelSpace);
     440             : 
     441     2774040 :                 bool bJustInitialize =
     442     2042060 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
     443       37173 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
     444     4841680 :                     nXOff <= nLBlockX * nBlockXSize &&
     445       25572 :                     nXOff + nXSize >= nXRight;
     446             : 
     447             :                 // Is this a partial tile at right and/or bottom edges of
     448             :                 // the raster, and that is going to be completely written?
     449             :                 // If so, do not load it from storage, but zero it so that
     450             :                 // the content outsize of the validity area is initialized.
     451     2774040 :                 bool bMemZeroBuffer = false;
     452     2042060 :                 if (eRWFlag == GF_Write && !bJustInitialize &&
     453     2017740 :                     nXOff <= nLBlockX * nBlockXSize &&
     454     2016120 :                     nYOff <= nLBlockY * nBlockYSize &&
     455       12095 :                     (nXOff + nXSize >= nXRight ||
     456             :                      // cppcheck-suppress knownConditionTrueFalse
     457     4818790 :                      (nXOff + nXSize == GetXSize() && nXRight > GetXSize())) &&
     458       11917 :                     (nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize ||
     459       10678 :                      (nYOff + nYSize == GetYSize() &&
     460        1891 :                       nLBlockY * nBlockYSize > GetYSize() - nBlockYSize)))
     461             :                 {
     462        3130 :                     bJustInitialize = true;
     463        3130 :                     bMemZeroBuffer = true;
     464             :                 }
     465             : 
     466             :                 /* --------------------------------------------------------------------
     467             :                  */
     468             :                 /*      Ensure we have the appropriate block loaded. */
     469             :                 /* --------------------------------------------------------------------
     470             :                  */
     471     2774040 :                 const GUInt32 nErrorCounter = CPLGetErrorCounter();
     472     2774290 :                 poBlock =
     473     2773940 :                     GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     474     2774290 :                 if (!poBlock)
     475             :                 {
     476          74 :                     if (strstr(CPLGetLastErrorMsg(), "IReadBlock failed") ==
     477             :                         nullptr)
     478             :                     {
     479           0 :                         CPLError(CE_Failure, CPLE_AppDefined,
     480             :                                  "GetBlockRef failed at X block offset %d, "
     481             :                                  "Y block offset %d%s",
     482             :                                  nLBlockX, nLBlockY,
     483           0 :                                  (nErrorCounter != CPLGetErrorCounter())
     484           0 :                                      ? CPLSPrintf(": %s", CPLGetLastErrorMsg())
     485             :                                      : "");
     486             :                     }
     487          74 :                     return (CE_Failure);
     488             :                 }
     489             : 
     490     2774220 :                 if (eRWFlag == GF_Write)
     491     2042060 :                     poBlock->MarkDirty();
     492             : 
     493     2774220 :                 pabySrcBlock = static_cast<GByte *>(poBlock->GetDataRef());
     494     2774210 :                 if (bMemZeroBuffer)
     495             :                 {
     496        3130 :                     memset(pabySrcBlock, 0,
     497        3130 :                            static_cast<GPtrDiff_t>(nBandDataSize) *
     498        3130 :                                nBlockXSize * nBlockYSize);
     499             :                 }
     500             :                 /* --------------------------------------------------------------------
     501             :                  */
     502             :                 /*      Copy over this chunk of data. */
     503             :                 /* --------------------------------------------------------------------
     504             :                  */
     505     2774210 :                 GPtrDiff_t iSrcOffset =
     506     2774210 :                     (static_cast<GPtrDiff_t>(iSrcX) -
     507     2774210 :                      static_cast<GPtrDiff_t>(nLBlockX * nBlockXSize) +
     508     2774210 :                      (static_cast<GPtrDiff_t>(iSrcY) -
     509     2774210 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     510     2774210 :                          nBlockXSize) *
     511     2774210 :                     nBandDataSize;
     512             :                 // Fill up as many rows as possible for the loaded block.
     513     5548390 :                 const int kmax = std::min(nBlockYSize - (iSrcY % nBlockYSize),
     514     2774210 :                                           nBufYSize - iBufYOff);
     515    58577900 :                 for (int k = 0; k < kmax; k++)
     516             :                 {
     517    55803900 :                     if (eDataType == eBufType && nPixelSpace == nBufDataSize)
     518             :                     {
     519    51775200 :                         if (eRWFlag == GF_Read)
     520    47410200 :                             memcpy(static_cast<GByte *>(pData) + iBufOffset +
     521    47410200 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     522    47410200 :                                    pabySrcBlock + iSrcOffset, nXSpanSize);
     523             :                         else
     524     4365040 :                             memcpy(pabySrcBlock + iSrcOffset,
     525     4365040 :                                    static_cast<GByte *>(pData) + iBufOffset +
     526     4365040 :                                        static_cast<GPtrDiff_t>(k) * nLineSpace,
     527             :                                    nXSpanSize);
     528             :                     }
     529             :                     else
     530             :                     {
     531             :                         /* type to type conversion */
     532     4028700 :                         if (eRWFlag == GF_Read)
     533     3908520 :                             GDALCopyWords64(
     534     3908520 :                                 pabySrcBlock + iSrcOffset, eDataType,
     535             :                                 nBandDataSize,
     536     3908520 :                                 static_cast<GByte *>(pData) + iBufOffset +
     537     3908520 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     538             :                                 eBufType, static_cast<int>(nPixelSpace),
     539             :                                 nXSpan);
     540             :                         else
     541      120182 :                             GDALCopyWords64(
     542      120182 :                                 static_cast<GByte *>(pData) + iBufOffset +
     543      120182 :                                     static_cast<GPtrDiff_t>(k) * nLineSpace,
     544             :                                 eBufType, static_cast<int>(nPixelSpace),
     545      120182 :                                 pabySrcBlock + iSrcOffset, eDataType,
     546             :                                 nBandDataSize, nXSpan);
     547             :                     }
     548             : 
     549    55803800 :                     iSrcOffset +=
     550    55803800 :                         static_cast<GPtrDiff_t>(nBlockXSize) * nBandDataSize;
     551             :                 }
     552             : 
     553             :                 iBufOffset =
     554     2774020 :                     CPLUnsanitizedAdd<GPtrDiff_t>(iBufOffset, nXSpanSize);
     555     2774030 :                 nLBlockX++;
     556     2774030 :                 iSrcX += nXSpan;
     557             : 
     558     2774030 :                 poBlock->DropLock();
     559     2774260 :                 poBlock = nullptr;
     560             :             }
     561             : 
     562             :             /* Compute the increment to go on a block boundary */
     563     2562820 :             nYInc = nBlockYSize - (iSrcY % nBlockYSize);
     564             : 
     565     2564600 :             if (psExtraArg->pfnProgress != nullptr &&
     566        1784 :                 !psExtraArg->pfnProgress(
     567     2564600 :                     1.0 * std::min(nBufYSize, iBufYOff + nYInc) / nBufYSize, "",
     568             :                     psExtraArg->pProgressData))
     569             :             {
     570         100 :                 return CE_Failure;
     571             :             }
     572             :         }
     573             : 
     574     2528240 :         return CE_None;
     575             :     }
     576             : 
     577             :     /* ==================================================================== */
     578             :     /*      Loop reading required source blocks to satisfy output           */
     579             :     /*      request.  This is the most general implementation.              */
     580             :     /* ==================================================================== */
     581             : 
     582      362937 :     double dfXOff = nXOff;
     583      362937 :     double dfYOff = nYOff;
     584      362937 :     double dfXSize = nXSize;
     585      362937 :     double dfYSize = nYSize;
     586      362937 :     if (psExtraArg->bFloatingPointWindowValidity)
     587             :     {
     588       28159 :         dfXOff = psExtraArg->dfXOff;
     589       28159 :         dfYOff = psExtraArg->dfYOff;
     590       28159 :         dfXSize = psExtraArg->dfXSize;
     591       28159 :         dfYSize = psExtraArg->dfYSize;
     592             :     }
     593             : 
     594             :     /* -------------------------------------------------------------------- */
     595             :     /*      Compute stepping increment.                                     */
     596             :     /* -------------------------------------------------------------------- */
     597      362937 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
     598      362937 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
     599      362937 :     CPLErr eErr = CE_None;
     600             : 
     601      362937 :     if (eRWFlag == GF_Write)
     602             :     {
     603             :         /* --------------------------------------------------------------------
     604             :          */
     605             :         /*    Write case */
     606             :         /*    Loop over raster window computing source locations in the buffer.
     607             :          */
     608             :         /* --------------------------------------------------------------------
     609             :          */
     610      166650 :         GByte *pabyDstBlock = nullptr;
     611      166650 :         int nLBlockX = -1;
     612      166650 :         int nLBlockY = -1;
     613             : 
     614     1259590 :         for (int iDstY = nYOff; iDstY < nYOff + nYSize; iDstY++)
     615             :         {
     616     1092940 :             const int iBufYOff = static_cast<int>((iDstY - nYOff) / dfSrcYInc);
     617             : 
     618    12063600 :             for (int iDstX = nXOff; iDstX < nXOff + nXSize; iDstX++)
     619             :             {
     620    10970600 :                 const int iBufXOff =
     621    10970600 :                     static_cast<int>((iDstX - nXOff) / dfSrcXInc);
     622    10970600 :                 GPtrDiff_t iBufOffset =
     623    10970600 :                     static_cast<GPtrDiff_t>(iBufYOff) *
     624             :                         static_cast<GPtrDiff_t>(nLineSpace) +
     625    10970600 :                     iBufXOff * static_cast<GPtrDiff_t>(nPixelSpace);
     626             : 
     627             :                 // FIXME: this code likely doesn't work if the dirty block gets
     628             :                 // flushed to disk before being completely written.
     629             :                 // In the meantime, bJustInitialize should probably be set to
     630             :                 // FALSE even if it is not ideal performance wise, and for
     631             :                 // lossy compression.
     632             : 
     633             :                 /* --------------------------------------------------------------------
     634             :                  */
     635             :                 /*      Ensure we have the appropriate block loaded. */
     636             :                 /* --------------------------------------------------------------------
     637             :                  */
     638    10970600 :                 if (iDstX < nLBlockX * nBlockXSize ||
     639    10721300 :                     iDstX - nBlockXSize >= nLBlockX * nBlockXSize ||
     640    10264600 :                     iDstY < nLBlockY * nBlockYSize ||
     641    10264600 :                     iDstY - nBlockYSize >= nLBlockY * nBlockYSize)
     642             :                 {
     643      738642 :                     nLBlockX = iDstX / nBlockXSize;
     644      738642 :                     nLBlockY = iDstY / nBlockYSize;
     645             : 
     646      738642 :                     const bool bJustInitialize =
     647     1065870 :                         nYOff <= nLBlockY * nBlockYSize &&
     648      327231 :                         nYOff + nYSize - nBlockYSize >=
     649      327231 :                             nLBlockY * nBlockYSize &&
     650     1116140 :                         nXOff <= nLBlockX * nBlockXSize &&
     651       50265 :                         nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
     652             :                     /*bool bMemZeroBuffer = FALSE;
     653             :                     if( !bJustInitialize &&
     654             :                         nXOff <= nLBlockX * nBlockXSize &&
     655             :                         nYOff <= nLBlockY * nBlockYSize &&
     656             :                         (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
     657             :                          (nXOff + nXSize == GetXSize() &&
     658             :                          (nLBlockX+1) * nBlockXSize > GetXSize())) &&
     659             :                         (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
     660             :                          (nYOff + nYSize == GetYSize() &&
     661             :                          (nLBlockY+1) * nBlockYSize > GetYSize())) )
     662             :                     {
     663             :                         bJustInitialize = TRUE;
     664             :                         bMemZeroBuffer = TRUE;
     665             :                     }*/
     666      738642 :                     if (poBlock != nullptr)
     667      571992 :                         poBlock->DropLock();
     668             : 
     669      738642 :                     poBlock =
     670      738642 :                         GetLockedBlockRef(nLBlockX, nLBlockY, bJustInitialize);
     671      738642 :                     if (poBlock == nullptr)
     672             :                     {
     673           0 :                         return (CE_Failure);
     674             :                     }
     675             : 
     676      738642 :                     poBlock->MarkDirty();
     677             : 
     678      738642 :                     pabyDstBlock = static_cast<GByte *>(poBlock->GetDataRef());
     679             :                     /*if( bMemZeroBuffer )
     680             :                     {
     681             :                         memset(pabyDstBlock, 0,
     682             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
     683             :                     * nBlockYSize);
     684             :                     }*/
     685             :                 }
     686             : 
     687             :                 // To make Coverity happy. Should not happen by design.
     688    10970600 :                 if (pabyDstBlock == nullptr)
     689             :                 {
     690           0 :                     CPLAssert(false);
     691             :                     eErr = CE_Failure;
     692             :                     break;
     693             :                 }
     694             : 
     695             :                 /* --------------------------------------------------------------------
     696             :                  */
     697             :                 /*      Copy over this pixel of data. */
     698             :                 /* --------------------------------------------------------------------
     699             :                  */
     700    10970600 :                 GPtrDiff_t iDstOffset =
     701    10970600 :                     (static_cast<GPtrDiff_t>(iDstX) -
     702    10970600 :                      static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
     703    10970600 :                      (static_cast<GPtrDiff_t>(iDstY) -
     704    10970600 :                       static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
     705    10970600 :                          nBlockXSize) *
     706    10970600 :                     nBandDataSize;
     707             : 
     708    10970600 :                 if (eDataType == eBufType)
     709             :                 {
     710    10967500 :                     memcpy(pabyDstBlock + iDstOffset,
     711    10967500 :                            static_cast<GByte *>(pData) + iBufOffset,
     712             :                            nBandDataSize);
     713             :                 }
     714             :                 else
     715             :                 {
     716             :                     /* type to type conversion ... ouch, this is expensive way
     717             :                     of handling single words */
     718             : 
     719        3096 :                     GDALCopyWords64(static_cast<GByte *>(pData) + iBufOffset,
     720        3096 :                                     eBufType, 0, pabyDstBlock + iDstOffset,
     721             :                                     eDataType, 0, 1);
     722             :                 }
     723             :             }
     724             : 
     725     1092940 :             if (psExtraArg->pfnProgress != nullptr &&
     726           0 :                 !psExtraArg->pfnProgress(1.0 * (iDstY - nYOff + 1) / nYSize, "",
     727             :                                          psExtraArg->pProgressData))
     728             :             {
     729           0 :                 eErr = CE_Failure;
     730           0 :                 break;
     731             :             }
     732             :         }
     733             :     }
     734             :     else
     735             :     {
     736      196287 :         if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
     737             :         {
     738        7638 :             if ((psExtraArg->eResampleAlg == GRIORA_Cubic ||
     739        2496 :                  psExtraArg->eResampleAlg == GRIORA_CubicSpline ||
     740        2494 :                  psExtraArg->eResampleAlg == GRIORA_Bilinear ||
     741        5147 :                  psExtraArg->eResampleAlg == GRIORA_Lanczos) &&
     742        2465 :                 GetColorTable() != nullptr)
     743             :             {
     744           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     745             :                          "Resampling method not supported on paletted band. "
     746             :                          "Falling back to nearest neighbour");
     747             :             }
     748        2574 :             else if (psExtraArg->eResampleAlg == GRIORA_Gauss &&
     749           3 :                      GDALDataTypeIsComplex(eDataType))
     750             :             {
     751           0 :                 CPLError(CE_Warning, CPLE_NotSupported,
     752             :                          "Resampling method not supported on complex data type "
     753             :                          "band. Falling back to nearest neighbour");
     754             :             }
     755             :             else
     756             :             {
     757        2571 :                 return RasterIOResampled(eRWFlag, nXOff, nYOff, nXSize, nYSize,
     758             :                                          pData, nBufXSize, nBufYSize, eBufType,
     759        2571 :                                          nPixelSpace, nLineSpace, psExtraArg);
     760             :             }
     761             :         }
     762             : 
     763      193625 :         int nLimitBlockY = 0;
     764      193625 :         const bool bByteCopy = eDataType == eBufType && nBandDataSize == 1;
     765      193625 :         int nStartBlockX = -nBlockXSize;
     766      193625 :         const double EPS = 1e-10;
     767      193625 :         int nLBlockY = -1;
     768      193625 :         const double dfSrcXStart = 0.5 * dfSrcXInc + dfXOff + EPS;
     769      193625 :         const bool bIntegerXFactor =
     770      170978 :             bUseIntegerRequestCoords &&
     771      265612 :             static_cast<int>(dfSrcXInc) == dfSrcXInc &&
     772       71987 :             static_cast<int>(dfSrcXInc) < INT_MAX / nBandDataSize;
     773             : 
     774             :         /* --------------------------------------------------------------------
     775             :          */
     776             :         /*      Read case */
     777             :         /*      Loop over buffer computing source locations. */
     778             :         /* --------------------------------------------------------------------
     779             :          */
     780     1945910 :         for (int iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
     781             :         {
     782             :             // Add small epsilon to avoid some numeric precision issues.
     783     1752300 :             const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
     784     1752300 :             const int iSrcY = static_cast<int>(std::min(
     785     1752300 :                 std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
     786             : 
     787     1752300 :             GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
     788             :                                     static_cast<GPtrDiff_t>(nLineSpace);
     789             : 
     790     1752300 :             if (iSrcY >= nLimitBlockY)
     791             :             {
     792      234842 :                 nLBlockY = iSrcY / nBlockYSize;
     793      234842 :                 nLimitBlockY = nLBlockY * nBlockYSize;
     794      234842 :                 if (nLimitBlockY < INT_MAX - nBlockYSize)
     795      234842 :                     nLimitBlockY += nBlockYSize;
     796             :                 else
     797           0 :                     nLimitBlockY = INT_MAX;
     798             :                 // Make sure a new block is loaded.
     799      234842 :                 nStartBlockX = -nBlockXSize;
     800             :             }
     801     1517450 :             else if (static_cast<int>(dfSrcXStart) < nStartBlockX)
     802             :             {
     803             :                 // Make sure a new block is loaded.
     804      429795 :                 nStartBlockX = -nBlockXSize;
     805             :             }
     806             : 
     807     1752300 :             GPtrDiff_t iSrcOffsetCst = (iSrcY - nLBlockY * nBlockYSize) *
     808     1752300 :                                        static_cast<GPtrDiff_t>(nBlockXSize);
     809             : 
     810     1752300 :             if (bIntegerXFactor)
     811             :             {
     812      413236 :                 int iSrcX = static_cast<int>(dfSrcXStart);
     813      413236 :                 const int nSrcXInc = static_cast<int>(dfSrcXInc);
     814      413236 :                 GByte *pabyDstData = static_cast<GByte *>(pData) + iBufOffset;
     815      413236 :                 bool bRet = false;
     816      413236 :                 if (bByteCopy)
     817             :                 {
     818      302849 :                     bRet = DownsamplingIntegerXFactor<true, 1>(
     819             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     820             :                         static_cast<int>(nPixelSpace), nBufXSize, GDT_Byte,
     821             :                         GDT_Byte, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     822             :                 }
     823      110387 :                 else if (eDataType == eBufType)
     824             :                 {
     825      110182 :                     switch (nBandDataSize)
     826             :                     {
     827      110102 :                         case 2:
     828      110102 :                             bRet = DownsamplingIntegerXFactor<true, 2>(
     829             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     830             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     831             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     832             :                                 nBlockXSize, poBlock, nLBlockY);
     833      110102 :                             break;
     834          22 :                         case 4:
     835          22 :                             bRet = DownsamplingIntegerXFactor<true, 4>(
     836             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     837             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     838             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     839             :                                 nBlockXSize, poBlock, nLBlockY);
     840          22 :                             break;
     841          56 :                         case 8:
     842          56 :                             bRet = DownsamplingIntegerXFactor<true, 8>(
     843             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     844             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     845             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     846             :                                 nBlockXSize, poBlock, nLBlockY);
     847          56 :                             break;
     848           2 :                         case 16:
     849           2 :                             bRet = DownsamplingIntegerXFactor<true, 16>(
     850             :                                 this, iSrcX, nSrcXInc, iSrcOffsetCst,
     851             :                                 pabyDstData, static_cast<int>(nPixelSpace),
     852             :                                 nBufXSize, eDataType, eDataType, nStartBlockX,
     853             :                                 nBlockXSize, poBlock, nLBlockY);
     854           2 :                             break;
     855           0 :                         default:
     856           0 :                             CPLAssert(false);
     857             :                             break;
     858             :                     }
     859             :                 }
     860             :                 else
     861             :                 {
     862         205 :                     bRet = DownsamplingIntegerXFactor<false, 0>(
     863             :                         this, iSrcX, nSrcXInc, iSrcOffsetCst, pabyDstData,
     864             :                         static_cast<int>(nPixelSpace), nBufXSize, eDataType,
     865             :                         eBufType, nStartBlockX, nBlockXSize, poBlock, nLBlockY);
     866             :                 }
     867      413236 :                 if (!bRet)
     868           1 :                     eErr = CE_Failure;
     869             :             }
     870             :             else
     871             :             {
     872     1339060 :                 double dfSrcX = dfSrcXStart;
     873   560923000 :                 for (int iBufXOff = 0; iBufXOff < nBufXSize;
     874   559584000 :                      iBufXOff++, dfSrcX += dfSrcXInc)
     875             :                 {
     876             :                     // TODO?: try to avoid the clamping for most iterations
     877             :                     const int iSrcX = static_cast<int>(
     878  1119170000 :                         std::min(std::max(0.0, dfSrcX),
     879   559584000 :                                  static_cast<double>(nRasterXSize - 1)));
     880             : 
     881             :                     /* --------------------------------------------------------------------
     882             :                      */
     883             :                     /*      Ensure we have the appropriate block loaded. */
     884             :                     /* --------------------------------------------------------------------
     885             :                      */
     886   559584000 :                     if (iSrcX >= nBlockXSize + nStartBlockX)
     887             :                     {
     888     1705400 :                         const int nLBlockX = iSrcX / nBlockXSize;
     889     1705400 :                         nStartBlockX = nLBlockX * nBlockXSize;
     890             : 
     891     1705400 :                         if (poBlock != nullptr)
     892     1583760 :                             poBlock->DropLock();
     893             : 
     894     1705400 :                         poBlock = GetLockedBlockRef(nLBlockX, nLBlockY, FALSE);
     895     1705400 :                         if (poBlock == nullptr)
     896             :                         {
     897           9 :                             eErr = CE_Failure;
     898           9 :                             break;
     899             :                         }
     900             : 
     901             :                         pabySrcBlock =
     902     1705390 :                             static_cast<GByte *>(poBlock->GetDataRef());
     903             :                     }
     904   559584000 :                     const GPtrDiff_t nDiffX =
     905   559584000 :                         static_cast<GPtrDiff_t>(iSrcX - nStartBlockX);
     906             : 
     907             :                     /* --------------------------------------------------------------------
     908             :                      */
     909             :                     /*      Copy over this pixel of data. */
     910             :                     /* --------------------------------------------------------------------
     911             :                      */
     912             : 
     913   559584000 :                     if (bByteCopy)
     914             :                     {
     915   506145000 :                         GPtrDiff_t iSrcOffset = nDiffX + iSrcOffsetCst;
     916   506145000 :                         static_cast<GByte *>(pData)[iBufOffset] =
     917   506145000 :                             pabySrcBlock[iSrcOffset];
     918             :                     }
     919    53439100 :                     else if (eDataType == eBufType)
     920             :                     {
     921    48225500 :                         GPtrDiff_t iSrcOffset =
     922    48225500 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     923    48225500 :                         memcpy(static_cast<GByte *>(pData) + iBufOffset,
     924    48225500 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
     925             :                     }
     926             :                     else
     927             :                     {
     928             :                         // Type to type conversion ...
     929     5213610 :                         GPtrDiff_t iSrcOffset =
     930     5213610 :                             (nDiffX + iSrcOffsetCst) * nBandDataSize;
     931     5213610 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
     932             :                                         static_cast<GByte *>(pData) +
     933     5213610 :                                             iBufOffset,
     934             :                                         eBufType, 0, 1);
     935             :                     }
     936             : 
     937   559584000 :                     iBufOffset += static_cast<int>(nPixelSpace);
     938             :                 }
     939             :             }
     940     1752300 :             if (eErr == CE_Failure)
     941          11 :                 break;
     942             : 
     943     1963450 :             if (psExtraArg->pfnProgress != nullptr &&
     944      211166 :                 !psExtraArg->pfnProgress(1.0 * (iBufYOff + 1) / nBufYSize, "",
     945             :                                          psExtraArg->pProgressData))
     946             :             {
     947           1 :                 eErr = CE_Failure;
     948           1 :                 break;
     949             :             }
     950             :         }
     951             :     }
     952             : 
     953      360275 :     if (poBlock != nullptr)
     954      360265 :         poBlock->DropLock();
     955             : 
     956      360275 :     return eErr;
     957             : }
     958             : 
     959             : /************************************************************************/
     960             : /*                         GDALRasterIOTransformer()                    */
     961             : /************************************************************************/
     962             : 
     963             : struct GDALRasterIOTransformerStruct  // Per-axis offset/scale parameters that GDALRasterIOTransformer() reads through its pTransformerArg pointer.
     964             : {
     965             :     double dfXOff;  // X offset: added after scaling (dst->src), subtracted before dividing (src->dst).
     966             :     double dfYOff;  // Y offset, used the same way as dfXOff.
     967             :     double dfXRatioDstToSrc;  // X scale factor: multiplies x when going dst->src, divides when going src->dst.
     968             :     double dfYRatioDstToSrc;  // Y scale factor, used the same way as dfXRatioDstToSrc.
     969             : };
     970             : 
     971        6748 : static int GDALRasterIOTransformer(void *pTransformerArg, int bDstToSrc,
     972             :                                    int nPointCount, double *x, double *y,
     973             :                                    double * /* z */, int *panSuccess)
     974             : {  // Rewrites x[]/y[] in place with a per-axis scale-and-offset mapping; the z array is unused. NOTE(review): signature presumably matches GDAL's transformer callback type - confirm.
     975        6748 :     GDALRasterIOTransformerStruct *psParams =
     976             :         static_cast<GDALRasterIOTransformerStruct *>(pTransformerArg);  // pTransformerArg is treated as a GDALRasterIOTransformerStruct; no null check is done here.
     977        6748 :     if (bDstToSrc)  // Non-zero: map destination coordinates to source ones.
     978             :     {
     979      252996 :         for (int i = 0; i < nPointCount; i++)
     980             :         {
     981      246836 :             x[i] = x[i] * psParams->dfXRatioDstToSrc + psParams->dfXOff;  // src = dst * ratio + offset
     982      246836 :             y[i] = y[i] * psParams->dfYRatioDstToSrc + psParams->dfYOff;
     983      246836 :             panSuccess[i] = TRUE;  // Every point is unconditionally flagged as transformed.
     984             :         }
     985             :     }
     986             :     else  // Zero: apply the algebraic inverse (source to destination).
     987             :     {
     988        1176 :         for (int i = 0; i < nPointCount; i++)
     989             :         {
     990         588 :             x[i] = (x[i] - psParams->dfXOff) / psParams->dfXRatioDstToSrc;  // dst = (src - offset) / ratio. NOTE(review): assumes the ratio is non-zero - confirm where the struct is filled in.
     991         588 :             y[i] = (y[i] - psParams->dfYOff) / psParams->dfYRatioDstToSrc;
     992         588 :             panSuccess[i] = TRUE;
     993             :         }
     994             :     }
     995        6748 :     return TRUE;  // Always returns TRUE; there is no failure path in this function.
     996             : }
     997             : 
     998             : /************************************************************************/
     999             : /*                          RasterIOResampled()                         */
    1000             : /************************************************************************/
    1001             : // Resampled read of the window (nXOff, nYOff, nXSize, nYSize) of this band into the nBufXSize x nBufYSize buffer pData, using psExtraArg->eResampleAlg.
    1002             : //! @cond Doxygen_Suppress
    1003        2571 : CPLErr GDALRasterBand::RasterIOResampled(
    1004             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1005             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1006             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    1007             : {
    1008             :     // The warper is used only for complex data types with an algorithm other
    1009             :     const bool bUseWarp =
    1010        2571 :         (GDALDataTypeIsComplex(eDataType) &&
    1011        2728 :          psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    1012         157 :          psExtraArg->eResampleAlg != GRIORA_Mode);  // than nearest/mode; otherwise the overview resampling kernels are used
    1013             :     // Source window as doubles, replaced by the floating-point window of psExtraArg when it is flagged valid.
    1014        2571 :     double dfXOff = nXOff;
    1015        2571 :     double dfYOff = nYOff;
    1016        2571 :     double dfXSize = nXSize;
    1017        2571 :     double dfYSize = nYSize;
    1018        2571 :     if (psExtraArg->bFloatingPointWindowValidity)
    1019             :     {
    1020        2114 :         dfXOff = psExtraArg->dfXOff;
    1021        2114 :         dfYOff = psExtraArg->dfYOff;
    1022        2114 :         dfXSize = psExtraArg->dfXSize;
    1023        2114 :         dfYSize = psExtraArg->dfYSize;
    1024             :     }
    1025             :     // Number of source pixels per destination (buffer) pixel along each axis.
    1026        2571 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1027        2571 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1028             : 
    1029             :     // Determine the coordinates in the "virtual" output raster to see
    1030             :     // if they are integers (within 1e-8), in which case we will use them as
    1031             :     // a shift so that subwindow extracts give the exact same results as
    1032             :     // entire raster scaling.
    1033        2571 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1034        2571 :     bool bHasXOffVirtual = false;
    1035        2571 :     int nDestXOffVirtual = 0;
    1036        2571 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1037             :     {
    1038        2245 :         bHasXOffVirtual = true;
    1039        2245 :         dfXOff = nXOff;  // makes dfXOff - nXOff == 0 further down
    1040        2245 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1041             :     }
    1042             :     // Same test for the Y offset.
    1043        2571 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1044        2571 :     bool bHasYOffVirtual = false;
    1045        2571 :     int nDestYOffVirtual = 0;
    1046        2571 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1047             :     {
    1048        2239 :         bHasYOffVirtual = true;
    1049        2239 :         dfYOff = nYOff;  // makes dfYOff - nYOff == 0 further down
    1050        2239 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1051             :     }
    1052             : 
    1053             :     // Create a MEM dataset that wraps the output buffer.
    1054             :     GDALDataset *poMEMDS;
    1055        2571 :     void *pTempBuffer = nullptr;
    1056        2571 :     GSpacing nPSMem = nPixelSpace;
    1057        2571 :     GSpacing nLSMem = nLineSpace;
    1058        2571 :     void *pDataMem = pData;
    1059        2571 :     GDALDataType eDTMem = eBufType;
    1060        2571 :     if (eBufType != eDataType)  // resample into a packed temporary buffer of the band type; converted to pData at the end
    1061             :     {
    1062          40 :         nPSMem = GDALGetDataTypeSizeBytes(eDataType);
    1063          40 :         nLSMem = nPSMem * nBufXSize;
    1064             :         pTempBuffer =
    1065          40 :             VSI_MALLOC2_VERBOSE(nBufYSize, static_cast<size_t>(nLSMem));
    1066          40 :         if (pTempBuffer == nullptr)
    1067           0 :             return CE_Failure;
    1068          40 :         pDataMem = pTempBuffer;
    1069          40 :         eDTMem = eDataType;
    1070             :     }
    1071             :     // The MEM dataset includes the virtual offset; pabyData is shifted back so that pixel (nDestXOffVirtual, nDestYOffVirtual) is the start of pDataMem.
    1072             :     poMEMDS =
    1073        2571 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1074             :                            nDestYOffVirtual + nBufYSize, 0, eDTMem, nullptr);
    1075        2571 :     GByte *pabyData = static_cast<GByte *>(pDataMem) -
    1076        2571 :                       nPSMem * nDestXOffVirtual - nLSMem * nDestYOffVirtual;
    1077        2571 :     GDALRasterBandH hMEMBand = MEMCreateRasterBandEx(
    1078             :         poMEMDS, 1, pabyData, eDTMem, nPSMem, nLSMem, false);
    1079        2571 :     poMEMDS->SetBand(1, GDALRasterBand::FromHandle(hMEMBand));
    1080             :     // Copy the NBITS item of the IMAGE_STRUCTURE domain, if set, to the MEM band; nNBITS is 0 when absent.
    1081        2571 :     const char *pszNBITS = GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1082        2571 :     const int nNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    1083        2571 :     if (pszNBITS)
    1084           6 :         GDALRasterBand::FromHandle(hMEMBand)->SetMetadataItem(
    1085           6 :             "NBITS", pszNBITS, "IMAGE_STRUCTURE");
    1086             : 
    1087        2571 :     CPLErr eErr = CE_None;
    1088             : 
    1089             :     // Do the resampling.
    1090        2571 :     if (bUseWarp)
    1091             :     {
    1092         149 :         int bHasNoData = FALSE;
    1093         149 :         double dfNoDataValue = GetNoDataValue(&bHasNoData);
    1094             : 
    1095         149 :         VRTDatasetH hVRTDS = nullptr;
    1096         149 :         GDALRasterBandH hVRTBand = nullptr;
    1097         149 :         if (GetDataset() == nullptr)  // no owning dataset to use as warp source
    1098             :         {
    1099             :             /* Create VRT dataset that wraps the whole dataset */
    1100           0 :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1101           0 :             VRTAddBand(hVRTDS, eDataType, nullptr);
    1102           0 :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1103           0 :             VRTAddSimpleSource(hVRTBand, this, 0, 0, nRasterXSize, nRasterYSize,
    1104             :                                0, 0, nRasterXSize, nRasterYSize, nullptr,
    1105             :                                VRT_NODATA_UNSET);
    1106             : 
    1107             :             /* Add a mask band if needed */
    1108           0 :             if (GetMaskFlags() != GMF_ALL_VALID)
    1109             :             {
    1110           0 :                 GDALDataset::FromHandle(hVRTDS)->CreateMaskBand(0);
    1111             :                 VRTSourcedRasterBand *poVRTMaskBand =
    1112             :                     reinterpret_cast<VRTSourcedRasterBand *>(
    1113             :                         reinterpret_cast<GDALRasterBand *>(hVRTBand)
    1114           0 :                             ->GetMaskBand());
    1115           0 :                 poVRTMaskBand->AddMaskBandSource(this, 0, 0, nRasterXSize,
    1116           0 :                                                  nRasterYSize, 0, 0,
    1117           0 :                                                  nRasterXSize, nRasterYSize);
    1118             :             }
    1119             :         }
    1120             :         // Map the GRIORA_* algorithm of psExtraArg to the corresponding GRA_* warper algorithm.
    1121         149 :         GDALWarpOptions *psWarpOptions = GDALCreateWarpOptions();
    1122         149 :         switch (psExtraArg->eResampleAlg)
    1123             :         {
    1124           0 :             case GRIORA_NearestNeighbour:
    1125           0 :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1126           0 :                 break;
    1127         147 :             case GRIORA_Bilinear:
    1128         147 :                 psWarpOptions->eResampleAlg = GRA_Bilinear;
    1129         147 :                 break;
    1130           0 :             case GRIORA_Cubic:
    1131           0 :                 psWarpOptions->eResampleAlg = GRA_Cubic;
    1132           0 :                 break;
    1133           0 :             case GRIORA_CubicSpline:
    1134           0 :                 psWarpOptions->eResampleAlg = GRA_CubicSpline;
    1135           0 :                 break;
    1136           0 :             case GRIORA_Lanczos:
    1137           0 :                 psWarpOptions->eResampleAlg = GRA_Lanczos;
    1138           0 :                 break;
    1139           0 :             case GRIORA_Average:
    1140           0 :                 psWarpOptions->eResampleAlg = GRA_Average;
    1141           0 :                 break;
    1142           2 :             case GRIORA_RMS:
    1143           2 :                 psWarpOptions->eResampleAlg = GRA_RMS;
    1144           2 :                 break;
    1145           0 :             case GRIORA_Mode:
    1146           0 :                 psWarpOptions->eResampleAlg = GRA_Mode;
    1147           0 :                 break;
    1148           0 :             default:
    1149           0 :                 CPLAssert(false);
    1150             :                 psWarpOptions->eResampleAlg = GRA_NearestNeighbour;
    1151             :                 break;
    1152             :         }
    1153         149 :         psWarpOptions->hSrcDS = hVRTDS ? hVRTDS : GetDataset();
    1154         149 :         psWarpOptions->hDstDS = poMEMDS;
    1155         149 :         psWarpOptions->nBandCount = 1;
    1156         149 :         int nSrcBandNumber = hVRTDS ? 1 : nBand;  // the wrapping VRT has a single band
    1157         149 :         int nDstBandNumber = 1;
    1158         149 :         psWarpOptions->panSrcBands = &nSrcBandNumber;  // points to a local, see the reset before destruction
    1159         149 :         psWarpOptions->panDstBands = &nDstBandNumber;  // points to a local, see the reset before destruction
    1160         298 :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress
    1161         149 :                                          ? psExtraArg->pfnProgress
    1162             :                                          : GDALDummyProgress;
    1163         149 :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1164         149 :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1165         149 :         if (bHasNoData)  // use the band nodata value for both source and destination
    1166             :         {
    1167           0 :             psWarpOptions->papszWarpOptions = CSLSetNameValue(
    1168             :                 psWarpOptions->papszWarpOptions, "INIT_DEST", "NO_DATA");
    1169           0 :             if (psWarpOptions->padfSrcNoDataReal == nullptr)
    1170             :             {
    1171           0 :                 psWarpOptions->padfSrcNoDataReal =
    1172           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1173           0 :                 psWarpOptions->padfSrcNoDataReal[0] = dfNoDataValue;
    1174             :             }
    1175             : 
    1176           0 :             if (psWarpOptions->padfDstNoDataReal == nullptr)
    1177             :             {
    1178           0 :                 psWarpOptions->padfDstNoDataReal =
    1179           0 :                     static_cast<double *>(CPLMalloc(sizeof(double)));
    1180           0 :                 psWarpOptions->padfDstNoDataReal[0] = dfNoDataValue;
    1181             :             }
    1182             :         }
    1183             :         // With an integer virtual offset the shift is carried by the destination window below, so the transformer offset is 0.
    1184             :         GDALRasterIOTransformerStruct sTransformer;
    1185         149 :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1186         149 :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1187         149 :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1188         149 :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1189         149 :         psWarpOptions->pTransformerArg = &sTransformer;
    1190             :         // Warp only the requested window of the MEM dataset.
    1191             :         GDALWarpOperationH hWarpOperation =
    1192         149 :             GDALCreateWarpOperation(psWarpOptions);
    1193         149 :         eErr = GDALChunkAndWarpImage(hWarpOperation, nDestXOffVirtual,
    1194             :                                      nDestYOffVirtual, nBufXSize, nBufYSize);
    1195         149 :         GDALDestroyWarpOperation(hWarpOperation);
    1196             :         // The band arrays point to locals: detach them, presumably so that GDALDestroyWarpOptions() does not try to free them.
    1197         149 :         psWarpOptions->panSrcBands = nullptr;
    1198         149 :         psWarpOptions->panDstBands = nullptr;
    1199         149 :         GDALDestroyWarpOptions(psWarpOptions);
    1200             : 
    1201         149 :         if (hVRTDS)
    1202           0 :             GDALClose(hVRTDS);
    1203             :     }
    1204             :     else
    1205             :     {
    1206        2422 :         const char *pszResampling =
    1207        2608 :             (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
    1208         297 :             : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
    1209         220 :             : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
    1210         213 :             : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
    1211         159 :             : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
    1212          95 :             : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
    1213          43 :             : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
    1214           3 :             : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
    1215             :                                                                : "UNKNOWN";
    1216             :         // Resampling function, its kernel radius, and the working data type used for the source chunks.
    1217        2422 :         int nKernelRadius = 0;
    1218             :         GDALResampleFunction pfnResampleFunc =
    1219        2422 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1220        2422 :         CPLAssert(pfnResampleFunc);
    1221             :         GDALDataType eWrkDataType =
    1222        2422 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1223        2422 :         int nHasNoData = 0;
    1224        2422 :         double dfNoDataValue = GetNoDataValue(&nHasNoData);
    1225        2422 :         const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    1226        2422 :         if (!bHasNoData)
    1227        2358 :             dfNoDataValue = 0.0;
    1228             :         // Halve the destination block size until the matching full-resolution chunk holds at most 1024 * 1024 pixels (or the block is 1x1).
    1229        2422 :         int nDstBlockXSize = nBufXSize;
    1230        2422 :         int nDstBlockYSize = nBufYSize;
    1231        2422 :         int nFullResXChunk = 0;
    1232        2422 :         int nFullResYChunk = 0;
    1233             :         while (true)
    1234             :         {
    1235        2422 :             nFullResXChunk =
    1236        2422 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1237        2422 :             nFullResYChunk =
    1238        2422 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1239        2422 :             if (nFullResXChunk > nRasterXSize)
    1240        2233 :                 nFullResXChunk = nRasterXSize;
    1241        2422 :             if (nFullResYChunk > nRasterYSize)
    1242         216 :                 nFullResYChunk = nRasterYSize;
    1243        2422 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1244        2376 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1245             :                  1024 * 1024))
    1246             :                 break;
    1247             :             // When operating on the full width of a raster whose block width is
    1248             :             // the raster width, prefer doing chunks in height.
    1249           0 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1250             :                 nDstBlockYSize > 1)
    1251           0 :                 nDstBlockYSize /= 2;
    1252             :             /* Otherwise cut the maximal dimension */
    1253           0 :             else if (nDstBlockXSize > 1 &&
    1254           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1255           0 :                 nDstBlockXSize /= 2;
    1256             :             else
    1257           0 :                 nDstBlockYSize /= 2;
    1258             :         }
    1259             :         // Rounded downsampling factors (at least 1); the source read is enlarged by nKernelRadius * factor on each side.
    1260        2422 :         int nOvrXFactor = static_cast<int>(0.5 + dfXRatioDstToSrc);
    1261        2422 :         int nOvrYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    1262        2422 :         if (nOvrXFactor == 0)
    1263        2024 :             nOvrXFactor = 1;
    1264        2422 :         if (nOvrYFactor == 0)
    1265        2023 :             nOvrYFactor = 1;
    1266        2422 :         int nFullResXSizeQueried =
    1267        2422 :             nFullResXChunk + 2 * nKernelRadius * nOvrXFactor;
    1268        2422 :         int nFullResYSizeQueried =
    1269        2422 :             nFullResYChunk + 2 * nKernelRadius * nOvrYFactor;
    1270             :         // Never query more than the raster itself.
    1271        2422 :         if (nFullResXSizeQueried > nRasterXSize)
    1272        2135 :             nFullResXSizeQueried = nRasterXSize;
    1273        2422 :         if (nFullResYSizeQueried > nRasterYSize)
    1274         129 :             nFullResYSizeQueried = nRasterYSize;
    1275             :         // Working buffers: the source chunk and, when the band is not flagged all-valid, its mask.
    1276             :         void *pChunk =
    1277        2422 :             VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    1278             :                                 nFullResXSizeQueried, nFullResYSizeQueried);
    1279        2422 :         GByte *pabyChunkNoDataMask = nullptr;
    1280             : 
    1281        2422 :         GDALRasterBand *poMaskBand = GetMaskBand();
    1282        2422 :         int l_nMaskFlags = GetMaskFlags();
    1283             : 
    1284        2422 :         bool bUseNoDataMask = ((l_nMaskFlags & GMF_ALL_VALID) == 0);
    1285        2422 :         if (bUseNoDataMask)
    1286             :         {
    1287         126 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1288             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1289             :         }
    1290        2422 :         if (pChunk == nullptr ||
    1291         126 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1292             :         {
    1293           0 :             GDALClose(poMEMDS);  // allocation failure: release everything acquired so far
    1294           0 :             CPLFree(pChunk);
    1295           0 :             CPLFree(pabyChunkNoDataMask);
    1296           0 :             VSIFree(pTempBuffer);
    1297           0 :             return CE_Failure;
    1298             :         }
    1299             :         // Block count, used only for progress reporting.
    1300        2422 :         int nTotalBlocks = ((nBufXSize + nDstBlockXSize - 1) / nDstBlockXSize) *
    1301        2422 :                            ((nBufYSize + nDstBlockYSize - 1) / nDstBlockYSize);
    1302        2422 :         int nBlocksDone = 0;
    1303             :         // Iterate over destination blocks, rows of blocks first; both loops stop at the first error.
    1304             :         int nDstYOff;
    1305        4844 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1306        2422 :              nDstYOff += nDstBlockYSize)
    1307             :         {
    1308             :             int nDstYCount;
    1309        2422 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1310        2422 :                 nDstYCount = nDstBlockYSize;
    1311             :             else
    1312           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1313             :             // Source rows covering this block (with one extra row), clamped to the raster height.
    1314        2422 :             int nChunkYOff =
    1315        2422 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1316        2422 :             int nChunkYOff2 = nYOff + 1 +
    1317        2422 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1318             :                                                     dfYRatioDstToSrc));
    1319        2422 :             if (nChunkYOff2 > nRasterYSize)
    1320         323 :                 nChunkYOff2 = nRasterYSize;
    1321        2422 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1322        2422 :             CPLAssert(nYCount <= nFullResYChunk);
    1323             :             // Extend by the kernel footprint on both sides, then clamp to the raster extent.
    1324        2422 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrYFactor;
    1325        2422 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrYFactor;
    1326        2422 :             if (nChunkYOffQueried < 0)
    1327             :             {
    1328         231 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1329         231 :                 nChunkYOffQueried = 0;
    1330             :             }
    1331        2422 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1332         331 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1333        2422 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1334             : 
    1335        2422 :             int nDstXOff = 0;
    1336        4844 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1337        2422 :                  nDstXOff += nDstBlockXSize)
    1338             :             {
    1339        2422 :                 int nDstXCount = 0;
    1340        2422 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1341        2422 :                     nDstXCount = nDstBlockXSize;
    1342             :                 else
    1343           0 :                     nDstXCount = nBufXSize - nDstXOff;
    1344             :                 // Source columns covering this block (with one extra column), clamped to the raster width.
    1345        2422 :                 int nChunkXOff =
    1346        2422 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1347        2422 :                 int nChunkXOff2 =
    1348        2422 :                     nXOff + 1 +
    1349        2422 :                     static_cast<int>(
    1350        2422 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1351        2422 :                 if (nChunkXOff2 > nRasterXSize)
    1352        2234 :                     nChunkXOff2 = nRasterXSize;
    1353        2422 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1354        2422 :                 CPLAssert(nXCount <= nFullResXChunk);
    1355             :                 // Extend by the kernel footprint on both sides, then clamp to the raster extent.
    1356        2422 :                 int nChunkXOffQueried =
    1357        2422 :                     nChunkXOff - nKernelRadius * nOvrXFactor;
    1358        2422 :                 int nChunkXSizeQueried =
    1359        2422 :                     nXCount + 2 * nKernelRadius * nOvrXFactor;
    1360        2422 :                 if (nChunkXOffQueried < 0)
    1361             :                 {
    1362        2148 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1363        2148 :                     nChunkXOffQueried = 0;
    1364             :                 }
    1365        2422 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1366        2134 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1367        2422 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1368             : 
    1369             :                 // Read the source buffers.
    1370        2422 :                 eErr = RasterIO(GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1371             :                                 nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1372             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    1373             :                                 eWrkDataType, 0, 0, nullptr);
    1374             : 
    1375        2422 :                 bool bSkipResample = false;
    1376        2422 :                 bool bNoDataMaskFullyOpaque = false;
    1377        2422 :                 if (eErr == CE_None && bUseNoDataMask)
    1378             :                 {
    1379         126 :                     eErr = poMaskBand->RasterIO(
    1380             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1381             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1382             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1383             :                         nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
    1384             :                     // NOTE(review): the mask buffer is scanned below even if the mask read above failed; eErr is only tested afterwards - confirm this is intended.
    1385             :                     /* Optimizations if mask is fully opaque or transparent */
    1386         126 :                     int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1387         126 :                     GByte bVal = pabyChunkNoDataMask[0];
    1388         126 :                     int i = 1;
    1389      241310 :                     for (; i < nPixels; i++)
    1390             :                     {
    1391      241261 :                         if (pabyChunkNoDataMask[i] != bVal)
    1392          77 :                             break;
    1393             :                     }
    1394         126 :                     if (i == nPixels)  // every mask byte has the same value
    1395             :                     {
    1396          49 :                         if (bVal == 0)  // all zero: fill the block with the nodata value (0 if the band has none) and skip resampling
    1397             :                         {
    1398         712 :                             for (int j = 0; j < nDstYCount; j++)
    1399             :                             {
    1400         686 :                                 GDALCopyWords64(&dfNoDataValue, GDT_Float64, 0,
    1401             :                                                 static_cast<GByte *>(pDataMem) +
    1402         686 :                                                     nLSMem * (j + nDstYOff) +
    1403         686 :                                                     nDstXOff * nPSMem,
    1404             :                                                 eDTMem,
    1405             :                                                 static_cast<int>(nPSMem),
    1406             :                                                 nDstXCount);
    1407             :                             }
    1408          26 :                             bSkipResample = true;
    1409             :                         }
    1410             :                         else
    1411             :                         {
    1412          23 :                             bNoDataMaskFullyOpaque = true;  // uniform non-zero mask: resample without passing a mask
    1413             :                         }
    1414             :                     }
    1415             :                 }
    1416             :                 // Resample the chunk, then write the result into the MEM band at the (virtually shifted) block position.
    1417        2422 :                 if (!bSkipResample && eErr == CE_None)
    1418             :                 {
    1419        2394 :                     const bool bPropagateNoData = false;
    1420        2394 :                     void *pDstBuffer = nullptr;
    1421        2394 :                     GDALDataType eDstBufferDataType = GDT_Unknown;
    1422             :                     GDALRasterBand *poMEMBand =
    1423        2394 :                         GDALRasterBand::FromHandle(hMEMBand);
    1424        2394 :                     GDALOverviewResampleArgs args;
    1425        2394 :                     args.eSrcDataType = eDataType;
    1426        2394 :                     args.eOvrDataType = poMEMBand->GetRasterDataType();
    1427        2394 :                     args.nOvrXSize = poMEMBand->GetXSize();
    1428        2394 :                     args.nOvrYSize = poMEMBand->GetYSize();
    1429        2394 :                     args.nOvrNBITS = nNBITS;
    1430        2394 :                     args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1431        2394 :                     args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1432        2394 :                     args.dfSrcXDelta =
    1433        2394 :                         dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1434        2394 :                     args.dfSrcYDelta =
    1435        2394 :                         dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1436        2394 :                     args.eWrkDataType = eWrkDataType;
    1437        2394 :                     args.pabyChunkNodataMask =
    1438        2394 :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask;
    1439        2394 :                     args.nChunkXOff =
    1440        2394 :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1441        2394 :                     args.nChunkXSize = nChunkXSizeQueried;
    1442        2394 :                     args.nChunkYOff =
    1443        2394 :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1444        2394 :                     args.nChunkYSize = nChunkYSizeQueried;
    1445        2394 :                     args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1446        2394 :                     args.nDstXOff2 = nDstXOff + nDestXOffVirtual + nDstXCount;
    1447        2394 :                     args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1448        2394 :                     args.nDstYOff2 = nDstYOff + nDestYOffVirtual + nDstYCount;
    1449        2394 :                     args.pszResampling = pszResampling;
    1450        2394 :                     args.bHasNoData = bHasNoData;
    1451        2394 :                     args.dfNoDataValue = dfNoDataValue;
    1452        2394 :                     args.poColorTable = GetColorTable();
    1453        2394 :                     args.bPropagateNoData = bPropagateNoData;
    1454        2394 :                     eErr = pfnResampleFunc(args, pChunk, &pDstBuffer,
    1455             :                                            &eDstBufferDataType);
    1456        2394 :                     if (eErr == CE_None)
    1457             :                     {
    1458        2394 :                         eErr = poMEMBand->RasterIO(
    1459             :                             GF_Write, nDstXOff + nDestXOffVirtual,
    1460             :                             nDstYOff + nDestYOffVirtual, nDstXCount, nDstYCount,
    1461             :                             pDstBuffer, nDstXCount, nDstYCount,
    1462             :                             eDstBufferDataType, 0, 0, nullptr);
    1463             :                     }
    1464        2394 :                     CPLFree(pDstBuffer);  // buffer handed back by the resample function through pDstBuffer
    1465             :                 }
    1466             :                 // Report progress; a false return from the callback turns into CE_Failure, which ends both loops.
    1467        2422 :                 nBlocksDone++;
    1468        2451 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1469          29 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    1470             :                                              "", psExtraArg->pProgressData))
    1471             :                 {
    1472           1 :                     eErr = CE_Failure;
    1473             :                 }
    1474             :             }
    1475             :         }
    1476             : 
    1477        2422 :         CPLFree(pChunk);
    1478        2422 :         CPLFree(pabyChunkNoDataMask);
    1479             :     }
    1480             :     // Convert the temporary buffer into the caller's buffer type and layout; the status of this copy is deliberately ignored.
    1481        2571 :     if (eBufType != eDataType)
    1482             :     {
    1483          40 :         CPL_IGNORE_RET_VAL(poMEMDS->GetRasterBand(1)->RasterIO(
    1484             :             GF_Read, nDestXOffVirtual, nDestYOffVirtual, nBufXSize, nBufYSize,
    1485             :             pData, nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    1486             :             nullptr));
    1487             :     }
    1488        2571 :     GDALClose(poMEMDS);
    1489        2571 :     VSIFree(pTempBuffer);
    1490             : 
    1491        2571 :     return eErr;  // CE_None, or the error that stopped processing (I/O, resampling, warping or progress abort)
    1492             : }
    1493             : 
    1494             : /************************************************************************/
    1495             : /*                          RasterIOResampled()                         */
    1496             : /************************************************************************/
    1497             : 
    1498         284 : CPLErr GDALDataset::RasterIOResampled(
    1499             :     GDALRWFlag /* eRWFlag */, int nXOff, int nYOff, int nXSize, int nYSize,
    1500             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    1501             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    1502             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    1503             : 
    1504             : {
    1505             : #if 0
    1506             :     // Determine if we use warping resampling or overview resampling
    1507             :     bool bUseWarp = false;
    1508             :     if( GDALDataTypeIsComplex( eDataType ) )
    1509             :         bUseWarp = true;
    1510             : #endif
    1511             : 
    1512         284 :     double dfXOff = nXOff;
    1513         284 :     double dfYOff = nYOff;
    1514         284 :     double dfXSize = nXSize;
    1515         284 :     double dfYSize = nYSize;
    1516         284 :     if (psExtraArg->bFloatingPointWindowValidity)
    1517             :     {
    1518         162 :         dfXOff = psExtraArg->dfXOff;
    1519         162 :         dfYOff = psExtraArg->dfYOff;
    1520         162 :         dfXSize = psExtraArg->dfXSize;
    1521         162 :         dfYSize = psExtraArg->dfYSize;
    1522             :     }
    1523             : 
    1524         284 :     const double dfXRatioDstToSrc = dfXSize / nBufXSize;
    1525         284 :     const double dfYRatioDstToSrc = dfYSize / nBufYSize;
    1526             : 
    1527             :     // Determine the coordinates in the "virtual" output raster to see
    1528             :     // if they are integers, in which case we will use them as a shift
    1529             :     // so that subwindow extracts give the exact same results as entire raster
    1530             :     // scaling.
    1531         284 :     double dfDestXOff = dfXOff / dfXRatioDstToSrc;
    1532         284 :     bool bHasXOffVirtual = false;
    1533         284 :     int nDestXOffVirtual = 0;
    1534         284 :     if (fabs(dfDestXOff - static_cast<int>(dfDestXOff + 0.5)) < 1e-8)
    1535             :     {
    1536         161 :         bHasXOffVirtual = true;
    1537         161 :         dfXOff = nXOff;
    1538         161 :         nDestXOffVirtual = static_cast<int>(dfDestXOff + 0.5);
    1539             :     }
    1540             : 
    1541         284 :     double dfDestYOff = dfYOff / dfYRatioDstToSrc;
    1542         284 :     bool bHasYOffVirtual = false;
    1543         284 :     int nDestYOffVirtual = 0;
    1544         284 :     if (fabs(dfDestYOff - static_cast<int>(dfDestYOff + 0.5)) < 1e-8)
    1545             :     {
    1546         120 :         bHasYOffVirtual = true;
    1547         120 :         dfYOff = nYOff;
    1548         120 :         nDestYOffVirtual = static_cast<int>(dfDestYOff + 0.5);
    1549             :     }
    1550             : 
    1551             :     // Create a MEM dataset that wraps the output buffer.
    1552             :     GDALDataset *poMEMDS =
    1553         284 :         MEMDataset::Create("", nDestXOffVirtual + nBufXSize,
    1554             :                            nDestYOffVirtual + nBufYSize, 0, eBufType, nullptr);
    1555             :     GDALRasterBand **papoDstBands = static_cast<GDALRasterBand **>(
    1556         280 :         CPLMalloc(nBandCount * sizeof(GDALRasterBand *)));
    1557         275 :     int nNBITS = 0;
    1558        1230 :     for (int i = 0; i < nBandCount; i++)
    1559             :     {
    1560         949 :         char szBuffer[32] = {'\0'};
    1561        1917 :         int nRet = CPLPrintPointer(
    1562             :             szBuffer,
    1563         949 :             static_cast<GByte *>(pData) - nPixelSpace * nDestXOffVirtual -
    1564         949 :                 nLineSpace * nDestYOffVirtual + nBandSpace * i,
    1565             :             sizeof(szBuffer));
    1566         968 :         szBuffer[nRet] = 0;
    1567             : 
    1568         968 :         char szBuffer0[64] = {'\0'};
    1569         968 :         snprintf(szBuffer0, sizeof(szBuffer0), "DATAPOINTER=%s", szBuffer);
    1570             : 
    1571         968 :         char szBuffer1[64] = {'\0'};
    1572         968 :         snprintf(szBuffer1, sizeof(szBuffer1), "PIXELOFFSET=" CPL_FRMT_GIB,
    1573             :                  static_cast<GIntBig>(nPixelSpace));
    1574             : 
    1575         968 :         char szBuffer2[64] = {'\0'};
    1576         968 :         snprintf(szBuffer2, sizeof(szBuffer2), "LINEOFFSET=" CPL_FRMT_GIB,
    1577             :                  static_cast<GIntBig>(nLineSpace));
    1578             : 
    1579         968 :         char *apszOptions[4] = {szBuffer0, szBuffer1, szBuffer2, nullptr};
    1580             : 
    1581         968 :         poMEMDS->AddBand(eBufType, apszOptions);
    1582             : 
    1583         965 :         GDALRasterBand *poSrcBand = GetRasterBand(panBandMap[i]);
    1584         951 :         papoDstBands[i] = poMEMDS->GetRasterBand(i + 1);
    1585             :         const char *pszNBITS =
    1586         954 :             poSrcBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    1587         954 :         if (pszNBITS)
    1588             :         {
    1589           0 :             nNBITS = atoi(pszNBITS);
    1590           0 :             poMEMDS->GetRasterBand(i + 1)->SetMetadataItem("NBITS", pszNBITS,
    1591           0 :                                                            "IMAGE_STRUCTURE");
    1592             :         }
    1593             :     }
    1594             : 
    1595         281 :     CPLErr eErr = CE_None;
    1596             : 
    1597             :     // TODO(schwehr): Why disabled?  Why not just delete?
    1598             :     // Looks like this code was initially added as disabled by copying
    1599             :     // from RasterIO here:
    1600             :     // https://trac.osgeo.org/gdal/changeset/29572
    1601             : #if 0
    1602             :     // Do the resampling.
    1603             :     if( bUseWarp )
    1604             :     {
    1605             :         VRTDatasetH hVRTDS = nullptr;
    1606             :         GDALRasterBandH hVRTBand = nullptr;
    1607             :         if( GetDataset() == nullptr )
    1608             :         {
    1609             :             /* Create VRT dataset that wraps the whole dataset */
    1610             :             hVRTDS = VRTCreate(nRasterXSize, nRasterYSize);
    1611             :             VRTAddBand( hVRTDS, eDataType, nullptr );
    1612             :             hVRTBand = GDALGetRasterBand(hVRTDS, 1);
    1613             :             VRTAddSimpleSource( (VRTSourcedRasterBandH)hVRTBand,
    1614             :                                 (GDALRasterBandH)this,
    1615             :                                 0, 0,
    1616             :                                 nRasterXSize, nRasterYSize,
    1617             :                                 0, 0,
    1618             :                                 nRasterXSize, nRasterYSize,
    1619             :                                 nullptr, VRT_NODATA_UNSET );
    1620             : 
    1621             :             /* Add a mask band if needed */
    1622             :             if( GetMaskFlags() != GMF_ALL_VALID )
    1623             :             {
    1624             :                 ((GDALDataset*)hVRTDS)->CreateMaskBand(0);
    1625             :                 VRTSourcedRasterBand* poVRTMaskBand =
    1626             :                     (VRTSourcedRasterBand*)(((GDALRasterBand*)hVRTBand)->GetMaskBand());
    1627             :                 poVRTMaskBand->
    1628             :                     AddMaskBandSource( this,
    1629             :                                     0, 0,
    1630             :                                     nRasterXSize, nRasterYSize,
    1631             :                                     0, 0,
    1632             :                                     nRasterXSize, nRasterYSize);
    1633             :             }
    1634             :         }
    1635             : 
    1636             :         GDALWarpOptions* psWarpOptions = GDALCreateWarpOptions();
    1637             :         psWarpOptions->eResampleAlg = (GDALResampleAlg)psExtraArg->eResampleAlg;
    1638             :         psWarpOptions->hSrcDS = (GDALDatasetH) (hVRTDS ? hVRTDS : GetDataset());
    1639             :         psWarpOptions->hDstDS = (GDALDatasetH) poMEMDS;
    1640             :         psWarpOptions->nBandCount = 1;
    1641             :         int nSrcBandNumber = (hVRTDS ? 1 : nBand);
    1642             :         int nDstBandNumber = 1;
    1643             :         psWarpOptions->panSrcBands = &nSrcBandNumber;
    1644             :         psWarpOptions->panDstBands = &nDstBandNumber;
    1645             :         psWarpOptions->pfnProgress = psExtraArg->pfnProgress ?
    1646             :                     psExtraArg->pfnProgress : GDALDummyProgress;
    1647             :         psWarpOptions->pProgressArg = psExtraArg->pProgressData;
    1648             :         psWarpOptions->pfnTransformer = GDALRasterIOTransformer;
    1649             :         GDALRasterIOTransformerStruct sTransformer;
    1650             :         sTransformer.dfXOff = bHasXOffVirtual ? 0 : dfXOff;
    1651             :         sTransformer.dfYOff = bHasYOffVirtual ? 0 : dfYOff;
    1652             :         sTransformer.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1653             :         sTransformer.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1654             :         psWarpOptions->pTransformerArg = &sTransformer;
    1655             : 
    1656             :         GDALWarpOperationH hWarpOperation = GDALCreateWarpOperation(psWarpOptions);
    1657             :         eErr = GDALChunkAndWarpImage( hWarpOperation,
    1658             :                                       nDestXOffVirtual, nDestYOffVirtual,
    1659             :                                       nBufXSize, nBufYSize );
    1660             :         GDALDestroyWarpOperation( hWarpOperation );
    1661             : 
    1662             :         psWarpOptions->panSrcBands = nullptr;
    1663             :         psWarpOptions->panDstBands = nullptr;
    1664             :         GDALDestroyWarpOptions( psWarpOptions );
    1665             : 
    1666             :         if( hVRTDS )
    1667             :             GDALClose(hVRTDS);
    1668             :     }
    1669             :     else
    1670             : #endif
    1671             :     {
    1672         281 :         const char *pszResampling =
    1673         441 :             (psExtraArg->eResampleAlg == GRIORA_Bilinear)      ? "BILINEAR"
    1674         160 :             : (psExtraArg->eResampleAlg == GRIORA_Cubic)       ? "CUBIC"
    1675           0 :             : (psExtraArg->eResampleAlg == GRIORA_CubicSpline) ? "CUBICSPLINE"
    1676           0 :             : (psExtraArg->eResampleAlg == GRIORA_Lanczos)     ? "LANCZOS"
    1677           0 :             : (psExtraArg->eResampleAlg == GRIORA_Average)     ? "AVERAGE"
    1678           0 :             : (psExtraArg->eResampleAlg == GRIORA_RMS)         ? "RMS"
    1679           0 :             : (psExtraArg->eResampleAlg == GRIORA_Mode)        ? "MODE"
    1680           0 :             : (psExtraArg->eResampleAlg == GRIORA_Gauss)       ? "GAUSS"
    1681             :                                                                : "UNKNOWN";
    1682             : 
    1683         281 :         GDALRasterBand *poFirstSrcBand = GetRasterBand(panBandMap[0]);
    1684         278 :         GDALDataType eDataType = poFirstSrcBand->GetRasterDataType();
    1685             :         int nBlockXSize, nBlockYSize;
    1686         273 :         poFirstSrcBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    1687             : 
    1688             :         int nKernelRadius;
    1689             :         GDALResampleFunction pfnResampleFunc =
    1690         273 :             GDALGetResampleFunction(pszResampling, &nKernelRadius);
    1691         273 :         CPLAssert(pfnResampleFunc);
    1692             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1693             :         GDALResampleFunctionMultiBands pfnResampleFuncMultiBands =
    1694             :             GDALGetResampleFunctionMultiBands(pszResampling, &nKernelRadius);
    1695             : #endif
    1696             :         GDALDataType eWrkDataType =
    1697         273 :             GDALGetOvrWorkDataType(pszResampling, eDataType);
    1698             : 
    1699         271 :         int nDstBlockXSize = nBufXSize;
    1700         271 :         int nDstBlockYSize = nBufYSize;
    1701             :         int nFullResXChunk, nFullResYChunk;
    1702             :         while (true)
    1703             :         {
    1704         271 :             nFullResXChunk =
    1705         271 :                 3 + static_cast<int>(nDstBlockXSize * dfXRatioDstToSrc);
    1706         271 :             nFullResYChunk =
    1707         271 :                 3 + static_cast<int>(nDstBlockYSize * dfYRatioDstToSrc);
    1708         271 :             if (nFullResXChunk > nRasterXSize)
    1709         151 :                 nFullResXChunk = nRasterXSize;
    1710         271 :             if (nFullResYChunk > nRasterYSize)
    1711          33 :                 nFullResYChunk = nRasterYSize;
    1712         271 :             if ((nDstBlockXSize == 1 && nDstBlockYSize == 1) ||
    1713         269 :                 (static_cast<GIntBig>(nFullResXChunk) * nFullResYChunk <=
    1714             :                  1024 * 1024))
    1715             :                 break;
    1716             :             // When operating on the full width of a raster whose block width is
    1717             :             // the raster width, prefer doing chunks in height.
    1718           0 :             if (nFullResXChunk >= nXSize && nXSize == nBlockXSize &&
    1719             :                 nDstBlockYSize > 1)
    1720           0 :                 nDstBlockYSize /= 2;
    1721             :             /* Otherwise cut the maximal dimension */
    1722           0 :             else if (nDstBlockXSize > 1 &&
    1723           0 :                      (nFullResXChunk > nFullResYChunk || nDstBlockYSize == 1))
    1724           0 :                 nDstBlockXSize /= 2;
    1725             :             else
    1726           0 :                 nDstBlockYSize /= 2;
    1727             :         }
    1728             : 
    1729         541 :         int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    1730         271 :                                   static_cast<int>(0.5 + dfYRatioDstToSrc));
    1731         270 :         if (nOvrFactor == 0)
    1732          94 :             nOvrFactor = 1;
    1733         270 :         int nFullResXSizeQueried =
    1734         270 :             nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    1735         270 :         int nFullResYSizeQueried =
    1736         270 :             nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
    1737             : 
    1738         270 :         if (nFullResXSizeQueried > nRasterXSize)
    1739         154 :             nFullResXSizeQueried = nRasterXSize;
    1740         270 :         if (nFullResYSizeQueried > nRasterYSize)
    1741          36 :             nFullResYSizeQueried = nRasterYSize;
    1742             : 
    1743         270 :         void *pChunk = VSI_MALLOC3_VERBOSE(
    1744             :             cpl::fits_on<int>(GDALGetDataTypeSizeBytes(eWrkDataType) *
    1745             :                               nBandCount),
    1746             :             nFullResXSizeQueried, nFullResYSizeQueried);
    1747         284 :         GByte *pabyChunkNoDataMask = nullptr;
    1748             : 
    1749         284 :         GDALRasterBand *poMaskBand = poFirstSrcBand->GetMaskBand();
    1750         280 :         int nMaskFlags = poFirstSrcBand->GetMaskFlags();
    1751             : 
    1752         280 :         bool bUseNoDataMask = ((nMaskFlags & GMF_ALL_VALID) == 0);
    1753         280 :         if (bUseNoDataMask)
    1754             :         {
    1755          55 :             pabyChunkNoDataMask = static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    1756             :                 nFullResXSizeQueried, nFullResYSizeQueried));
    1757             :         }
    1758         280 :         if (pChunk == nullptr ||
    1759          55 :             (bUseNoDataMask && pabyChunkNoDataMask == nullptr))
    1760             :         {
    1761           1 :             GDALClose(poMEMDS);
    1762           0 :             CPLFree(pChunk);
    1763           0 :             CPLFree(pabyChunkNoDataMask);
    1764           0 :             CPLFree(papoDstBands);
    1765           0 :             return CE_Failure;
    1766             :         }
    1767             : 
    1768         279 :         int nTotalBlocks = ((nBufXSize + nDstBlockXSize - 1) / nDstBlockXSize) *
    1769         279 :                            ((nBufYSize + nDstBlockYSize - 1) / nDstBlockYSize);
    1770         279 :         int nBlocksDone = 0;
    1771             : 
    1772             :         int nDstYOff;
    1773         566 :         for (nDstYOff = 0; nDstYOff < nBufYSize && eErr == CE_None;
    1774         287 :              nDstYOff += nDstBlockYSize)
    1775             :         {
    1776             :             int nDstYCount;
    1777         272 :             if (nDstYOff + nDstBlockYSize <= nBufYSize)
    1778         273 :                 nDstYCount = nDstBlockYSize;
    1779             :             else
    1780           0 :                 nDstYCount = nBufYSize - nDstYOff;
    1781             : 
    1782         272 :             int nChunkYOff =
    1783         272 :                 nYOff + static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    1784         272 :             int nChunkYOff2 = nYOff + 1 +
    1785         272 :                               static_cast<int>(ceil((nDstYOff + nDstYCount) *
    1786             :                                                     dfYRatioDstToSrc));
    1787         272 :             if (nChunkYOff2 > nRasterYSize)
    1788          56 :                 nChunkYOff2 = nRasterYSize;
    1789         272 :             int nYCount = nChunkYOff2 - nChunkYOff;
    1790         272 :             CPLAssert(nYCount <= nFullResYChunk);
    1791             : 
    1792         272 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    1793         272 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
    1794         272 :             if (nChunkYOffQueried < 0)
    1795             :             {
    1796          56 :                 nChunkYSizeQueried += nChunkYOffQueried;
    1797          56 :                 nChunkYOffQueried = 0;
    1798             :             }
    1799         272 :             if (nChunkYSizeQueried + nChunkYOffQueried > nRasterYSize)
    1800          66 :                 nChunkYSizeQueried = nRasterYSize - nChunkYOffQueried;
    1801         272 :             CPLAssert(nChunkYSizeQueried <= nFullResYSizeQueried);
    1802             : 
    1803             :             int nDstXOff;
    1804         556 :             for (nDstXOff = 0; nDstXOff < nBufXSize && eErr == CE_None;
    1805         284 :                  nDstXOff += nDstBlockXSize)
    1806             :             {
    1807             :                 int nDstXCount;
    1808         269 :                 if (nDstXOff + nDstBlockXSize <= nBufXSize)
    1809         268 :                     nDstXCount = nDstBlockXSize;
    1810             :                 else
    1811           1 :                     nDstXCount = nBufXSize - nDstXOff;
    1812             : 
    1813         269 :                 int nChunkXOff =
    1814         269 :                     nXOff + static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    1815         269 :                 int nChunkXOff2 =
    1816         269 :                     nXOff + 1 +
    1817         269 :                     static_cast<int>(
    1818         269 :                         ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    1819         269 :                 if (nChunkXOff2 > nRasterXSize)
    1820         144 :                     nChunkXOff2 = nRasterXSize;
    1821         269 :                 int nXCount = nChunkXOff2 - nChunkXOff;
    1822         269 :                 CPLAssert(nXCount <= nFullResXChunk);
    1823             : 
    1824         269 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    1825         269 :                 int nChunkXSizeQueried =
    1826         269 :                     nXCount + 2 * nKernelRadius * nOvrFactor;
    1827         269 :                 if (nChunkXOffQueried < 0)
    1828             :                 {
    1829         144 :                     nChunkXSizeQueried += nChunkXOffQueried;
    1830         144 :                     nChunkXOffQueried = 0;
    1831             :                 }
    1832         269 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nRasterXSize)
    1833         150 :                     nChunkXSizeQueried = nRasterXSize - nChunkXOffQueried;
    1834         269 :                 CPLAssert(nChunkXSizeQueried <= nFullResXSizeQueried);
    1835             : 
    1836         269 :                 bool bSkipResample = false;
    1837         269 :                 bool bNoDataMaskFullyOpaque = false;
    1838         269 :                 if (eErr == CE_None && bUseNoDataMask)
    1839             :                 {
    1840          55 :                     eErr = poMaskBand->RasterIO(
    1841             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1842             :                         nChunkXSizeQueried, nChunkYSizeQueried,
    1843             :                         pabyChunkNoDataMask, nChunkXSizeQueried,
    1844             :                         nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
    1845             : 
    1846             :                     /* Optimizations if mask is fully opaque or transparent */
    1847          55 :                     const int nPixels = nChunkXSizeQueried * nChunkYSizeQueried;
    1848          55 :                     const GByte bVal = pabyChunkNoDataMask[0];
    1849          55 :                     int i = 1;  // Used after for.
    1850      123794 :                     for (; i < nPixels; i++)
    1851             :                     {
    1852      123777 :                         if (pabyChunkNoDataMask[i] != bVal)
    1853          38 :                             break;
    1854             :                     }
    1855          55 :                     if (i == nPixels)
    1856             :                     {
    1857          17 :                         if (bVal == 0)
    1858             :                         {
    1859          16 :                             GByte abyZero[16] = {0};
    1860          64 :                             for (int iBand = 0; iBand < nBandCount; iBand++)
    1861             :                             {
    1862        2016 :                                 for (int j = 0; j < nDstYCount; j++)
    1863             :                                 {
    1864        1968 :                                     GDALCopyWords64(
    1865             :                                         abyZero, GDT_Byte, 0,
    1866             :                                         static_cast<GByte *>(pData) +
    1867        1968 :                                             iBand * nBandSpace +
    1868        1968 :                                             nLineSpace * (j + nDstYOff) +
    1869        1968 :                                             nDstXOff * nPixelSpace,
    1870             :                                         eBufType, static_cast<int>(nPixelSpace),
    1871             :                                         nDstXCount);
    1872             :                                 }
    1873             :                             }
    1874          16 :                             bSkipResample = true;
    1875             :                         }
    1876             :                         else
    1877             :                         {
    1878           1 :                             bNoDataMaskFullyOpaque = true;
    1879             :                         }
    1880             :                     }
    1881             :                 }
    1882             : 
    1883         269 :                 if (!bSkipResample && eErr == CE_None)
    1884             :                 {
    1885             :                     /* Read the source buffers */
    1886         253 :                     eErr = RasterIO(
    1887             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    1888             :                         nChunkXSizeQueried, nChunkYSizeQueried, pChunk,
    1889             :                         nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType,
    1890             :                         nBandCount, panBandMap, 0, 0, 0, nullptr);
    1891             :                 }
    1892             : 
    1893             : #ifdef GDAL_ENABLE_RESAMPLING_MULTIBAND
    1894             :                 if (pfnResampleFuncMultiBands && !bSkipResample &&
    1895             :                     eErr == CE_None)
    1896             :                 {
    1897             :                     eErr = pfnResampleFuncMultiBands(
    1898             :                         dfXRatioDstToSrc, dfYRatioDstToSrc,
    1899             :                         dfXOff - nXOff, /* == 0 if bHasXOffVirtual */
    1900             :                         dfYOff - nYOff, /* == 0 if bHasYOffVirtual */
    1901             :                         eWrkDataType, (GByte *)pChunk, nBandCount,
    1902             :                         bNoDataMaskFullyOpaque ? nullptr : pabyChunkNoDataMask,
    1903             :                         nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff),
    1904             :                         nChunkXSizeQueried,
    1905             :                         nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff),
    1906             :                         nChunkYSizeQueried, nDstXOff + nDestXOffVirtual,
    1907             :                         nDstXOff + nDestXOffVirtual + nDstXCount,
    1908             :                         nDstYOff + nDestYOffVirtual,
    1909             :                         nDstYOff + nDestYOffVirtual + nDstYCount, papoDstBands,
    1910             :                         pszResampling, FALSE /*bHasNoData*/,
    1911             :                         0.0 /* dfNoDataValue */, nullptr /* color table*/,
    1912             :                         eDataType);
    1913             :                 }
    1914             :                 else
    1915             : #endif
    1916             :                 {
    1917             :                     size_t nChunkBandOffset =
    1918         281 :                         static_cast<size_t>(nChunkXSizeQueried) *
    1919         281 :                         nChunkYSizeQueried *
    1920         281 :                         GDALGetDataTypeSizeBytes(eWrkDataType);
    1921        1205 :                     for (int i = 0;
    1922        1205 :                          i < nBandCount && !bSkipResample && eErr == CE_None;
    1923             :                          i++)
    1924             :                     {
    1925         921 :                         const bool bPropagateNoData = false;
    1926         921 :                         void *pDstBuffer = nullptr;
    1927         921 :                         GDALDataType eDstBufferDataType = GDT_Unknown;
    1928             :                         GDALRasterBand *poMEMBand =
    1929         921 :                             poMEMDS->GetRasterBand(i + 1);
    1930         922 :                         GDALOverviewResampleArgs args;
    1931         922 :                         args.eSrcDataType = eDataType;
    1932         922 :                         args.eOvrDataType = poMEMBand->GetRasterDataType();
    1933         921 :                         args.nOvrXSize = poMEMBand->GetXSize();
    1934         920 :                         args.nOvrYSize = poMEMBand->GetYSize();
    1935         917 :                         args.nOvrNBITS = nNBITS;
    1936         917 :                         args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    1937         917 :                         args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    1938         917 :                         args.dfSrcXDelta =
    1939         917 :                             dfXOff - nXOff; /* == 0 if bHasXOffVirtual */
    1940         917 :                         args.dfSrcYDelta =
    1941         917 :                             dfYOff - nYOff; /* == 0 if bHasYOffVirtual */
    1942         917 :                         args.eWrkDataType = eWrkDataType;
    1943         917 :                         args.pabyChunkNodataMask = bNoDataMaskFullyOpaque
    1944         917 :                                                        ? nullptr
    1945             :                                                        : pabyChunkNoDataMask;
    1946         917 :                         args.nChunkXOff =
    1947         917 :                             nChunkXOffQueried - (bHasXOffVirtual ? 0 : nXOff);
    1948         917 :                         args.nChunkXSize = nChunkXSizeQueried;
    1949         917 :                         args.nChunkYOff =
    1950         917 :                             nChunkYOffQueried - (bHasYOffVirtual ? 0 : nYOff);
    1951         917 :                         args.nChunkYSize = nChunkYSizeQueried;
    1952         917 :                         args.nDstXOff = nDstXOff + nDestXOffVirtual;
    1953         917 :                         args.nDstXOff2 =
    1954         917 :                             nDstXOff + nDestXOffVirtual + nDstXCount;
    1955         917 :                         args.nDstYOff = nDstYOff + nDestYOffVirtual;
    1956         917 :                         args.nDstYOff2 =
    1957         917 :                             nDstYOff + nDestYOffVirtual + nDstYCount;
    1958         917 :                         args.pszResampling = pszResampling;
    1959         917 :                         args.bHasNoData = false;
    1960         917 :                         args.dfNoDataValue = 0.0;
    1961         917 :                         args.poColorTable = nullptr;
    1962         917 :                         args.bPropagateNoData = bPropagateNoData;
    1963             : 
    1964             :                         eErr =
    1965        1839 :                             pfnResampleFunc(args,
    1966         917 :                                             reinterpret_cast<GByte *>(pChunk) +
    1967         917 :                                                 i * nChunkBandOffset,
    1968             :                                             &pDstBuffer, &eDstBufferDataType);
    1969         922 :                         if (eErr == CE_None)
    1970             :                         {
    1971         922 :                             eErr = poMEMBand->RasterIO(
    1972             :                                 GF_Write, nDstXOff + nDestXOffVirtual,
    1973             :                                 nDstYOff + nDestYOffVirtual, nDstXCount,
    1974             :                                 nDstYCount, pDstBuffer, nDstXCount, nDstYCount,
    1975             :                                 eDstBufferDataType, 0, 0, nullptr);
    1976             :                         }
    1977         922 :                         CPLFree(pDstBuffer);
    1978             :                     }
    1979             :                 }
    1980             : 
    1981         284 :                 nBlocksDone++;
    1982         286 :                 if (eErr == CE_None && psExtraArg->pfnProgress != nullptr &&
    1983           2 :                     !psExtraArg->pfnProgress(1.0 * nBlocksDone / nTotalBlocks,
    1984             :                                              "", psExtraArg->pProgressData))
    1985             :                 {
    1986           0 :                     eErr = CE_Failure;
    1987             :                 }
    1988             :             }
    1989             :         }
    1990             : 
    1991         294 :         CPLFree(pChunk);
    1992         284 :         CPLFree(pabyChunkNoDataMask);
    1993             :     }
    1994             : 
    1995         284 :     CPLFree(papoDstBands);
    1996         284 :     GDALClose(poMEMDS);
    1997             : 
    1998         284 :     return eErr;
    1999             : }
    2000             : 
    2001             : //! @endcond
    2002             : 
    2003             : /************************************************************************/
    2004             : /*                           GDALSwapWords()                            */
    2005             : /************************************************************************/
    2006             : 
    2007             : /**
    2008             :  * Byte swap words in-place.
    2009             :  *
    2010             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2011             :  * a memory array.  No assumption is made that the words being swapped are
    2012             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2013             :  * to determine if the current platform is big endian or little endian.  Use
    2014             :  * The macros like CPL_SWAP32() to byte swap single values without the overhead
    2015             :  * of a function call.
    2016             :  *
    2017             :  * @param pData pointer to start of data buffer.
    2018             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2019             :  * @param nWordCount the number of words to be swapped in this call.
    2020             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2021             :  * the next. For packed buffers this is the same as nWordSize.
    2022             :  */
    2023             : 
    2024      438669 : void CPL_STDCALL GDALSwapWords(void *pData, int nWordSize, int nWordCount,
    2025             :                                int nWordSkip)
    2026             : 
    2027             : {
    2028      438669 :     if (nWordCount > 0)
    2029      438669 :         VALIDATE_POINTER0(pData, "GDALSwapWords");
    2030             : 
    2031      438669 :     GByte *pabyData = static_cast<GByte *>(pData);
    2032             : 
    2033      438669 :     switch (nWordSize)
    2034             :     {
    2035        7234 :         case 1:
    2036        7234 :             break;
    2037             : 
    2038      418175 :         case 2:
    2039      418175 :             CPLAssert(nWordSkip >= 2 || nWordCount == 1);
    2040   289160000 :             for (int i = 0; i < nWordCount; i++)
    2041             :             {
    2042   288742000 :                 CPL_SWAP16PTR(pabyData);
    2043   288742000 :                 pabyData += nWordSkip;
    2044             :             }
    2045      418175 :             break;
    2046             : 
    2047       10689 :         case 4:
    2048       10689 :             CPLAssert(nWordSkip >= 4 || nWordCount == 1);
    2049       10689 :             if (CPL_IS_ALIGNED(pabyData, 4) && (nWordSkip % 4) == 0)
    2050             :             {
    2051    29148800 :                 for (int i = 0; i < nWordCount; i++)
    2052             :                 {
    2053    29138100 :                     *reinterpret_cast<GUInt32 *>(pabyData) = CPL_SWAP32(
    2054             :                         *reinterpret_cast<const GUInt32 *>(pabyData));
    2055    29138100 :                     pabyData += nWordSkip;
    2056       10686 :                 }
    2057             :             }
    2058             :             else
    2059             :             {
    2060           9 :                 for (int i = 0; i < nWordCount; i++)
    2061             :                 {
    2062           6 :                     CPL_SWAP32PTR(pabyData);
    2063           6 :                     pabyData += nWordSkip;
    2064             :                 }
    2065             :             }
    2066       10689 :             break;
    2067             : 
    2068        2571 :         case 8:
    2069        2571 :             CPLAssert(nWordSkip >= 8 || nWordCount == 1);
    2070        2571 :             if (CPL_IS_ALIGNED(pabyData, 8) && (nWordSkip % 8) == 0)
    2071             :             {
    2072     3359870 :                 for (int i = 0; i < nWordCount; i++)
    2073             :                 {
    2074     3357300 :                     *reinterpret_cast<GUInt64 *>(pabyData) = CPL_SWAP64(
    2075             :                         *reinterpret_cast<const GUInt64 *>(pabyData));
    2076     3357300 :                     pabyData += nWordSkip;
    2077        2570 :                 }
    2078             :             }
    2079             :             else
    2080             :             {
    2081           3 :                 for (int i = 0; i < nWordCount; i++)
    2082             :                 {
    2083           2 :                     CPL_SWAP64PTR(pabyData);
    2084           2 :                     pabyData += nWordSkip;
    2085             :                 }
    2086             :             }
    2087        2571 :             break;
    2088             : 
    2089           0 :         default:
    2090           0 :             CPLAssert(false);
    2091             :     }
    2092             : }
    2093             : 
    2094             : /************************************************************************/
    2095             : /*                           GDALSwapWordsEx()                          */
    2096             : /************************************************************************/
    2097             : 
    2098             : /**
    2099             :  * Byte swap words in-place.
    2100             :  *
    2101             :  * This function will byte swap a set of 2, 4 or 8 byte words "in place" in
    2102             :  * a memory array.  No assumption is made that the words being swapped are
    2103             :  * word aligned in memory.  Use the CPL_LSB and CPL_MSB macros from cpl_port.h
    2104             :  * to determine if the current platform is big endian or little endian.  Use
    2105             :  * The macros like CPL_SWAP32() to byte swap single values without the overhead
    2106             :  * of a function call.
    2107             :  *
    2108             :  * @param pData pointer to start of data buffer.
    2109             :  * @param nWordSize size of words being swapped in bytes. Normally 2, 4 or 8.
    2110             :  * @param nWordCount the number of words to be swapped in this call.
    2111             :  * @param nWordSkip the byte offset from the start of one word to the start of
    2112             :  * the next. For packed buffers this is the same as nWordSize.
    2113             :  * @since GDAL 2.1
    2114             :  */
    2115        6378 : void CPL_STDCALL GDALSwapWordsEx(void *pData, int nWordSize, size_t nWordCount,
    2116             :                                  int nWordSkip)
    2117             : {
    2118        6378 :     GByte *pabyData = static_cast<GByte *>(pData);
    2119       12756 :     while (nWordCount)
    2120             :     {
    2121             :         // Pick-up a multiple of 8 as max chunk size.
    2122        6378 :         const int nWordCountSmall =
    2123        6378 :             (nWordCount > (1 << 30)) ? (1 << 30) : static_cast<int>(nWordCount);
    2124        6378 :         GDALSwapWords(pabyData, nWordSize, nWordCountSmall, nWordSkip);
    2125        6378 :         pabyData += static_cast<size_t>(nWordSkip) * nWordCountSmall;
    2126        6378 :         nWordCount -= nWordCountSmall;
    2127             :     }
    2128        6378 : }
    2129             : 
    2130             : // Place the new GDALCopyWords helpers in an anonymous namespace
    2131             : namespace
    2132             : {
    2133             : 
    2134             : /************************************************************************/
    2135             : /*                           GDALCopyWordsT()                           */
    2136             : /************************************************************************/
    2137             : /**
    2138             :  * Template function, used to copy data from pSrcData into buffer
    2139             :  * pDstData, with stride nSrcPixelStride in the source data and
    2140             :  * stride nDstPixelStride in the destination data. This template can
    2141             :  * deal with the case where the input data type is real or complex and
    2142             :  * the output is real.
    2143             :  *
    2144             :  * @param pSrcData the source data buffer
    2145             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2146             :  *                      of interest.
    2147             :  * @param pDstData the destination buffer.
    2148             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2149             :  *                      interest.
    2150             :  * @param nWordCount the total number of pixel words to copy
    2151             :  *
    2152             :  * @code
    2153             :  * // Assume an input buffer of type GUInt16 named pBufferIn
    2154             :  * GByte *pBufferOut = new GByte[numBytesOut];
    2155             :  * GDALCopyWordsT<GUInt16, GByte>(pSrcData, 2, pDstData, 1, numBytesOut);
    2156             :  * @endcode
    2157             :  * @note
    2158             :  * This is a private function, and should not be exposed outside of
    2159             :  * rasterio.cpp. External users should call the GDALCopyWords driver function.
    2160             :  */
    2161             : 
    2162             : template <class Tin, class Tout>
    2163    46725913 : static void inline GDALCopyWordsGenericT(const Tin *const CPL_RESTRICT pSrcData,
    2164             :                                          int nSrcPixelStride,
    2165             :                                          Tout *const CPL_RESTRICT pDstData,
    2166             :                                          int nDstPixelStride,
    2167             :                                          GPtrDiff_t nWordCount)
    2168             : {
    2169    46725913 :     decltype(nWordCount) nDstOffset = 0;
    2170             : 
    2171    46725913 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2172    46725913 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2173   601037742 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2174             :     {
    2175   554311580 :         const Tin tValue =
    2176   554311580 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2177   554311580 :         Tout *const pOutPixel =
    2178   554311580 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2179             : 
    2180   554311580 :         GDALCopyWord(tValue, *pOutPixel);
    2181             : 
    2182   554311680 :         nDstOffset += nDstPixelStride;
    2183             :     }
    2184    46726031 : }
    2185             : 
    2186             : template <class Tin, class Tout>
    2187    38279445 : static void inline GDALCopyWordsT(const Tin *const CPL_RESTRICT pSrcData,
    2188             :                                   int nSrcPixelStride,
    2189             :                                   Tout *const CPL_RESTRICT pDstData,
    2190             :                                   int nDstPixelStride, GPtrDiff_t nWordCount)
    2191             : {
    2192    38279445 :     GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData, nDstPixelStride,
    2193             :                           nWordCount);
    2194    38279493 : }
    2195             : 
    2196             : template <class Tin, class Tout>
    2197      194358 : static void inline GDALCopyWordsT_8atatime(
    2198             :     const Tin *const CPL_RESTRICT pSrcData, int nSrcPixelStride,
    2199             :     Tout *const CPL_RESTRICT pDstData, int nDstPixelStride,
    2200             :     GPtrDiff_t nWordCount)
    2201             : {
    2202      194358 :     decltype(nWordCount) nDstOffset = 0;
    2203             : 
    2204      194358 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2205      194358 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2206      194358 :     decltype(nWordCount) n = 0;
    2207      194358 :     if (nSrcPixelStride == static_cast<int>(sizeof(Tin)) &&
    2208             :         nDstPixelStride == static_cast<int>(sizeof(Tout)))
    2209             :     {
    2210    22734780 :         for (; n < nWordCount - 7; n += 8)
    2211             :         {
    2212    22545358 :             const Tin *pInValues = reinterpret_cast<const Tin *>(
    2213    22545358 :                 pSrcDataPtr + (n * nSrcPixelStride));
    2214    22545358 :             Tout *const pOutPixels =
    2215    22545358 :                 reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2216             : 
    2217    22545358 :             GDALCopy8Words(pInValues, pOutPixels);
    2218             : 
    2219    22541328 :             nDstOffset += 8 * nDstPixelStride;
    2220             :         }
    2221             :     }
    2222      687692 :     for (; n < nWordCount; n++)
    2223             :     {
    2224      493331 :         const Tin tValue =
    2225      493331 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + (n * nSrcPixelStride));
    2226      493331 :         Tout *const pOutPixel =
    2227      493331 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2228             : 
    2229      493331 :         GDALCopyWord(tValue, *pOutPixel);
    2230             : 
    2231      497417 :         nDstOffset += nDstPixelStride;
    2232             :     }
    2233      194361 : }
    2234             : 
    2235             : #ifdef HAVE_SSE2
    2236             : 
    2237             : template <class Tout>
    2238       39381 : void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
    2239             :                               int nSrcPixelStride,
    2240             :                               Tout *const CPL_RESTRICT pDstData,
    2241             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2242             : {
    2243             :     static_assert(std::is_integral<Tout>::value &&
    2244             :                       sizeof(Tout) == sizeof(uint16_t),
    2245             :                   "Bad Tout");
    2246       39381 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2247             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2248             :     {
    2249       33330 :         decltype(nWordCount) n = 0;
    2250       33330 :         const __m128i xmm_zero = _mm_setzero_si128();
    2251       33330 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2252             :             reinterpret_cast<GByte *>(pDstData);
    2253     1501757 :         for (; n < nWordCount - 15; n += 16)
    2254             :         {
    2255     1468427 :             __m128i xmm = _mm_loadu_si128(
    2256     1468427 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2257     1468427 :             __m128i xmm0 = _mm_unpacklo_epi8(xmm, xmm_zero);
    2258     1468427 :             __m128i xmm1 = _mm_unpackhi_epi8(xmm, xmm_zero);
    2259             :             _mm_storeu_si128(
    2260     1468427 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2), xmm0);
    2261             :             _mm_storeu_si128(
    2262     1468427 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 2 + 16), xmm1);
    2263             :         }
    2264      108789 :         for (; n < nWordCount; n++)
    2265             :         {
    2266       75459 :             pDstData[n] = pSrcData[n];
    2267       33330 :         }
    2268             :     }
    2269             :     else
    2270             :     {
    2271        6051 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2272             :                               nDstPixelStride, nWordCount);
    2273             :     }
    2274       39381 : }
    2275             : 
    2276             : template <>
    2277       25764 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2278             :                     int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
    2279             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2280             : {
    2281       25764 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2282             :                              nDstPixelStride, nWordCount);
    2283       25764 : }
    2284             : 
    2285             : template <>
    2286       13617 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2287             :                     int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
    2288             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2289             : {
    2290       13617 :     GDALCopyWordsByteTo16Bit(pSrcData, nSrcPixelStride, pDstData,
    2291             :                              nDstPixelStride, nWordCount);
    2292       13617 : }
    2293             : 
    2294             : template <class Tout>
    2295    12270938 : void GDALCopyWordsByteTo32Bit(const GByte *const CPL_RESTRICT pSrcData,
    2296             :                               int nSrcPixelStride,
    2297             :                               Tout *const CPL_RESTRICT pDstData,
    2298             :                               int nDstPixelStride, GPtrDiff_t nWordCount)
    2299             : {
    2300             :     static_assert(std::is_integral<Tout>::value &&
    2301             :                       sizeof(Tout) == sizeof(uint32_t),
    2302             :                   "Bad Tout");
    2303    12270938 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2304             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2305             :     {
    2306     6210628 :         decltype(nWordCount) n = 0;
    2307     6210628 :         const __m128i xmm_zero = _mm_setzero_si128();
    2308     6210628 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2309             :             reinterpret_cast<GByte *>(pDstData);
    2310    68847252 :         for (; n < nWordCount - 15; n += 16)
    2311             :         {
    2312    62822524 :             __m128i xmm = _mm_loadu_si128(
    2313    62822524 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2314    62784524 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2315    62795624 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2316    62783724 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2317    62591024 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2318    62505924 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2319    62636624 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2320             :             _mm_storeu_si128(
    2321    62636624 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4), xmm0);
    2322             :             _mm_storeu_si128(
    2323    62636624 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 16), xmm1);
    2324             :             _mm_storeu_si128(
    2325    62636624 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 32), xmm2);
    2326             :             _mm_storeu_si128(
    2327    62636624 :                 reinterpret_cast<__m128i *>(pabyDstDataPtr + n * 4 + 48), xmm3);
    2328             :         }
    2329    14087639 :         for (; n < nWordCount; n++)
    2330             :         {
    2331     8062831 :             pDstData[n] = pSrcData[n];
    2332     6024788 :         }
    2333             :     }
    2334             :     else
    2335             :     {
    2336     6060320 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2337             :                               nDstPixelStride, nWordCount);
    2338             :     }
    2339    12081138 : }
    2340             : 
    2341             : template <>
    2342         438 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2343             :                     int nSrcPixelStride, GUInt32 *const CPL_RESTRICT pDstData,
    2344             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2345             : {
    2346         438 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2347             :                              nDstPixelStride, nWordCount);
    2348         438 : }
    2349             : 
    2350             : template <>
    2351    12272200 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2352             :                     int nSrcPixelStride, GInt32 *const CPL_RESTRICT pDstData,
    2353             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2354             : {
    2355    12272200 :     GDALCopyWordsByteTo32Bit(pSrcData, nSrcPixelStride, pDstData,
    2356             :                              nDstPixelStride, nWordCount);
    2357    12282000 : }
    2358             : 
    2359             : template <>
    2360     2470670 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2361             :                     int nSrcPixelStride, float *const CPL_RESTRICT pDstData,
    2362             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2363             : {
    2364     2470670 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2365             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2366             :     {
    2367      111225 :         decltype(nWordCount) n = 0;
    2368      111225 :         const __m128i xmm_zero = _mm_setzero_si128();
    2369      111225 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2370             :             reinterpret_cast<GByte *>(pDstData);
    2371     3273060 :         for (; n < nWordCount - 15; n += 16)
    2372             :         {
    2373     3161840 :             __m128i xmm = _mm_loadu_si128(
    2374     3161840 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2375     3161840 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2376     3161840 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2377     3161840 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2378     3161840 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2379     3161840 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2380     3161840 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2381     3161840 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2382     3161840 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2383     3161840 :             __m128 xmm2_f = _mm_cvtepi32_ps(xmm2);
    2384     3161840 :             __m128 xmm3_f = _mm_cvtepi32_ps(xmm3);
    2385     3161840 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2386             :                           xmm0_f);
    2387             :             _mm_storeu_ps(
    2388     3161840 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2389             :             _mm_storeu_ps(
    2390     3161840 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 32), xmm2_f);
    2391             :             _mm_storeu_ps(
    2392     3161840 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 48), xmm3_f);
    2393             :         }
    2394      472813 :         for (; n < nWordCount; n++)
    2395             :         {
    2396      361588 :             pDstData[n] = pSrcData[n];
    2397      111225 :         }
    2398             :     }
    2399             :     else
    2400             :     {
    2401     2359440 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2402             :                               nDstPixelStride, nWordCount);
    2403             :     }
    2404     2470670 : }
    2405             : 
    2406             : template <>
    2407      146702 : void GDALCopyWordsT(const GByte *const CPL_RESTRICT pSrcData,
    2408             :                     int nSrcPixelStride, double *const CPL_RESTRICT pDstData,
    2409             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2410             : {
    2411      146702 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2412             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2413             :     {
    2414      123720 :         decltype(nWordCount) n = 0;
    2415      123720 :         const __m128i xmm_zero = _mm_setzero_si128();
    2416      123720 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2417             :             reinterpret_cast<GByte *>(pDstData);
    2418     1421860 :         for (; n < nWordCount - 15; n += 16)
    2419             :         {
    2420     1298140 :             __m128i xmm = _mm_loadu_si128(
    2421     1298140 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2422     1298140 :             __m128i xmm_low = _mm_unpacklo_epi8(xmm, xmm_zero);
    2423     1298140 :             __m128i xmm_high = _mm_unpackhi_epi8(xmm, xmm_zero);
    2424     1298140 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm_low, xmm_zero);
    2425     1298140 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm_low, xmm_zero);
    2426     1298140 :             __m128i xmm2 = _mm_unpacklo_epi16(xmm_high, xmm_zero);
    2427     1298140 :             __m128i xmm3 = _mm_unpackhi_epi16(xmm_high, xmm_zero);
    2428             : 
    2429     1298140 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2430     1298140 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2431     1298140 :             __m128d xmm2_low_d = _mm_cvtepi32_pd(xmm2);
    2432     1298140 :             __m128d xmm3_low_d = _mm_cvtepi32_pd(xmm3);
    2433     1298140 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2434     1298140 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2435     1298140 :             xmm2 = _mm_srli_si128(xmm2, 8);
    2436     1298140 :             xmm3 = _mm_srli_si128(xmm3, 8);
    2437     1298140 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2438     1298140 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2439     1298140 :             __m128d xmm2_high_d = _mm_cvtepi32_pd(xmm2);
    2440     1298140 :             __m128d xmm3_high_d = _mm_cvtepi32_pd(xmm3);
    2441             : 
    2442     1298140 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2443             :                           xmm0_low_d);
    2444             :             _mm_storeu_pd(
    2445     1298140 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2446             :                 xmm0_high_d);
    2447             :             _mm_storeu_pd(
    2448     1298140 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2449             :                 xmm1_low_d);
    2450             :             _mm_storeu_pd(
    2451     1298140 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2452             :                 xmm1_high_d);
    2453             :             _mm_storeu_pd(
    2454     1298140 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 64),
    2455             :                 xmm2_low_d);
    2456             :             _mm_storeu_pd(
    2457     1298140 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 80),
    2458             :                 xmm2_high_d);
    2459             :             _mm_storeu_pd(
    2460     1298140 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 96),
    2461             :                 xmm3_low_d);
    2462             :             _mm_storeu_pd(
    2463     1298140 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 112),
    2464             :                 xmm3_high_d);
    2465             :         }
    2466      234770 :         for (; n < nWordCount; n++)
    2467             :         {
    2468      111050 :             pDstData[n] = pSrcData[n];
    2469      123720 :         }
    2470             :     }
    2471             :     else
    2472             :     {
    2473       22982 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2474             :                               nDstPixelStride, nWordCount);
    2475             :     }
    2476      146702 : }
    2477             : 
    2478             : template <>
    2479        6006 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2480             :                     int nSrcPixelStride, GByte *const CPL_RESTRICT pDstData,
    2481             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2482             : {
    2483        6006 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2484             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2485             :     {
    2486        5031 :         decltype(nWordCount) n = 0;
    2487             :         // In SSE2, min_epu16 does not exist, so shift from
    2488             :         // UInt16 to SInt16 to be able to use min_epi16
    2489        5031 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2490        5031 :         const __m128i xmm_m255_shifted = _mm_set1_epi16(255 - 32768);
    2491      138471 :         for (; n < nWordCount - 7; n += 8)
    2492             :         {
    2493      133440 :             __m128i xmm = _mm_loadu_si128(
    2494      133440 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2495      133440 :             xmm = _mm_add_epi16(xmm, xmm_UINT16_to_INT16);
    2496      133440 :             xmm = _mm_min_epi16(xmm, xmm_m255_shifted);
    2497      133440 :             xmm = _mm_sub_epi16(xmm, xmm_UINT16_to_INT16);
    2498      133440 :             xmm = _mm_packus_epi16(xmm, xmm);
    2499      133440 :             GDALCopyXMMToInt64(xmm,
    2500      133440 :                                reinterpret_cast<GPtrDiff_t *>(pDstData + n));
    2501             :         }
    2502       16005 :         for (; n < nWordCount; n++)
    2503             :         {
    2504       10974 :             pDstData[n] =
    2505       10974 :                 pSrcData[n] >= 255 ? 255 : static_cast<GByte>(pSrcData[n]);
    2506        5031 :         }
    2507             :     }
    2508             :     else
    2509             :     {
    2510         975 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2511             :                               nDstPixelStride, nWordCount);
    2512             :     }
    2513        6006 : }
    2514             : 
    2515             : template <>
    2516          21 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2517             :                     int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
    2518             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2519             : {
    2520          21 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2521             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2522             :     {
    2523          15 :         decltype(nWordCount) n = 0;
    2524             :         // In SSE2, min_epu16 does not exist, so shift from
    2525             :         // UInt16 to SInt16 to be able to use min_epi16
    2526          15 :         const __m128i xmm_UINT16_to_INT16 = _mm_set1_epi16(-32768);
    2527          15 :         const __m128i xmm_32767_shifted = _mm_set1_epi16(32767 - 32768);
    2528          31 :         for (; n < nWordCount - 7; n += 8)
    2529             :         {
    2530          16 :             __m128i xmm = _mm_loadu_si128(
    2531          16 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2532          16 :             xmm = _mm_add_epi16(xmm, xmm_UINT16_to_INT16);
    2533          16 :             xmm = _mm_min_epi16(xmm, xmm_32767_shifted);
    2534          16 :             xmm = _mm_sub_epi16(xmm, xmm_UINT16_to_INT16);
    2535          16 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDstData + n), xmm);
    2536             :         }
    2537          55 :         for (; n < nWordCount; n++)
    2538             :         {
    2539          40 :             pDstData[n] =
    2540          40 :                 pSrcData[n] >= 32767 ? 32767 : static_cast<GInt16>(pSrcData[n]);
    2541          15 :         }
    2542             :     }
    2543             :     else
    2544             :     {
    2545           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2546             :                               nDstPixelStride, nWordCount);
    2547             :     }
    2548          21 : }
    2549             : 
    2550             : template <>
    2551         412 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2552             :                     int nSrcPixelStride, float *const CPL_RESTRICT pDstData,
    2553             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2554             : {
    2555         412 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2556             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2557             :     {
    2558         406 :         decltype(nWordCount) n = 0;
    2559         406 :         const __m128i xmm_zero = _mm_setzero_si128();
    2560         406 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2561             :             reinterpret_cast<GByte *>(pDstData);
    2562        1500 :         for (; n < nWordCount - 7; n += 8)
    2563             :         {
    2564        1094 :             __m128i xmm = _mm_loadu_si128(
    2565        1094 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2566        1094 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2567        1094 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2568        1094 :             __m128 xmm0_f = _mm_cvtepi32_ps(xmm0);
    2569        1094 :             __m128 xmm1_f = _mm_cvtepi32_ps(xmm1);
    2570        1094 :             _mm_storeu_ps(reinterpret_cast<float *>(pabyDstDataPtr + n * 4),
    2571             :                           xmm0_f);
    2572             :             _mm_storeu_ps(
    2573        1094 :                 reinterpret_cast<float *>(pabyDstDataPtr + n * 4 + 16), xmm1_f);
    2574             :         }
    2575        1483 :         for (; n < nWordCount; n++)
    2576             :         {
    2577        1077 :             pDstData[n] = pSrcData[n];
    2578         406 :         }
    2579             :     }
    2580             :     else
    2581             :     {
    2582           6 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2583             :                               nDstPixelStride, nWordCount);
    2584             :     }
    2585         412 : }
    2586             : 
    2587             : template <>
    2588         279 : void GDALCopyWordsT(const GUInt16 *const CPL_RESTRICT pSrcData,
    2589             :                     int nSrcPixelStride, double *const CPL_RESTRICT pDstData,
    2590             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2591             : {
    2592         279 :     if (nSrcPixelStride == static_cast<int>(sizeof(*pSrcData)) &&
    2593             :         nDstPixelStride == static_cast<int>(sizeof(*pDstData)))
    2594             :     {
    2595         171 :         decltype(nWordCount) n = 0;
    2596         171 :         const __m128i xmm_zero = _mm_setzero_si128();
    2597         171 :         GByte *CPL_RESTRICT pabyDstDataPtr =
    2598             :             reinterpret_cast<GByte *>(pDstData);
    2599         219 :         for (; n < nWordCount - 7; n += 8)
    2600             :         {
    2601          48 :             __m128i xmm = _mm_loadu_si128(
    2602          48 :                 reinterpret_cast<const __m128i *>(pSrcData + n));
    2603          48 :             __m128i xmm0 = _mm_unpacklo_epi16(xmm, xmm_zero);
    2604          48 :             __m128i xmm1 = _mm_unpackhi_epi16(xmm, xmm_zero);
    2605             : 
    2606          48 :             __m128d xmm0_low_d = _mm_cvtepi32_pd(xmm0);
    2607          48 :             __m128d xmm1_low_d = _mm_cvtepi32_pd(xmm1);
    2608          48 :             xmm0 = _mm_srli_si128(xmm0, 8);
    2609          48 :             xmm1 = _mm_srli_si128(xmm1, 8);
    2610          48 :             __m128d xmm0_high_d = _mm_cvtepi32_pd(xmm0);
    2611          48 :             __m128d xmm1_high_d = _mm_cvtepi32_pd(xmm1);
    2612             : 
    2613          48 :             _mm_storeu_pd(reinterpret_cast<double *>(pabyDstDataPtr + n * 8),
    2614             :                           xmm0_low_d);
    2615             :             _mm_storeu_pd(
    2616          48 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 16),
    2617             :                 xmm0_high_d);
    2618             :             _mm_storeu_pd(
    2619          48 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 32),
    2620             :                 xmm1_low_d);
    2621             :             _mm_storeu_pd(
    2622          48 :                 reinterpret_cast<double *>(pabyDstDataPtr + n * 8 + 48),
    2623             :                 xmm1_high_d);
    2624             :         }
    2625         429 :         for (; n < nWordCount; n++)
    2626             :         {
    2627         258 :             pDstData[n] = pSrcData[n];
    2628         171 :         }
    2629             :     }
    2630             :     else
    2631             :     {
    2632         108 :         GDALCopyWordsGenericT(pSrcData, nSrcPixelStride, pDstData,
    2633             :                               nDstPixelStride, nWordCount);
    2634             :     }
    2635         279 : }
    2636             : 
    2637             : template <>
    2638         811 : void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
    2639             :                     int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
    2640             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2641             : {
    2642         811 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2643             :                             nDstPixelStride, nWordCount);
    2644         811 : }
    2645             : 
    2646             : #endif  // HAVE_SSE2
    2647             : 
    2648             : template <>
    2649      116766 : void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    2650             :                     int nSrcPixelStride, GByte *const CPL_RESTRICT pDstData,
    2651             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2652             : {
    2653      116766 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2654             :                             nDstPixelStride, nWordCount);
    2655      116766 : }
    2656             : 
    2657             : template <>
    2658       15146 : void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    2659             :                     int nSrcPixelStride, GInt16 *const CPL_RESTRICT pDstData,
    2660             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2661             : {
    2662       15146 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2663             :                             nDstPixelStride, nWordCount);
    2664       15146 : }
    2665             : 
    2666             : template <>
    2667       61645 : void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
    2668             :                     int nSrcPixelStride, GUInt16 *const CPL_RESTRICT pDstData,
    2669             :                     int nDstPixelStride, GPtrDiff_t nWordCount)
    2670             : {
    2671       61645 :     GDALCopyWordsT_8atatime(pSrcData, nSrcPixelStride, pDstData,
    2672             :                             nDstPixelStride, nWordCount);
    2673       61639 : }
    2674             : 
    2675             : /************************************************************************/
    2676             : /*                   GDALCopyWordsComplexT()                            */
    2677             : /************************************************************************/
    2678             : /**
    2679             :  * Template function, used to copy data from pSrcData into buffer
    2680             :  * pDstData, with stride nSrcPixelStride in the source data and
    2681             :  * stride nDstPixelStride in the destination data. Deals with the
    2682             :  * complex case, where input is complex and output is complex.
    2683             :  *
    2684             :  * @param pSrcData the source data buffer
    2685             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2686             :  *                      of interest, in bytes.
    2687             :  * @param pDstData the destination buffer.
    2688             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2689             :  *                      interest, in bytes.
    2690             :  * @param nWordCount the total number of pixel words to copy
    2691             :  *
    2692             :  */
    2693             : template <class Tin, class Tout>
    2694      125172 : inline void GDALCopyWordsComplexT(const Tin *const CPL_RESTRICT pSrcData,
    2695             :                                   int nSrcPixelStride,
    2696             :                                   Tout *const CPL_RESTRICT pDstData,
    2697             :                                   int nDstPixelStride, GPtrDiff_t nWordCount)
    2698             : {
    2699      125172 :     decltype(nWordCount) nDstOffset = 0;
    2700      125172 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2701      125172 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2702             : 
    2703     7337873 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2704             :     {
    2705     7212696 :         const Tin *const pPixelIn =
    2706     7212696 :             reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    2707     7212696 :         Tout *const pPixelOut =
    2708     7212696 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2709             : 
    2710     7212696 :         GDALCopyWord(pPixelIn[0], pPixelOut[0]);
    2711     7212696 :         GDALCopyWord(pPixelIn[1], pPixelOut[1]);
    2712             : 
    2713     7212696 :         nDstOffset += nDstPixelStride;
    2714             :     }
    2715      125172 : }
    2716             : 
    2717             : /************************************************************************/
    2718             : /*                   GDALCopyWordsComplexOutT()                         */
    2719             : /************************************************************************/
    2720             : /**
    2721             :  * Template function, used to copy data from pSrcData into buffer
    2722             :  * pDstData, with stride nSrcPixelStride in the source data and
    2723             :  * stride nDstPixelStride in the destination data. Deals with the
    2724             :  * case where the value is real coming in, but complex going out.
    2725             :  *
    2726             :  * @param pSrcData the source data buffer
    2727             :  * @param nSrcPixelStride the stride, in the buffer pSrcData for pixels
    2728             :  *                      of interest, in bytes.
    2729             :  * @param pDstData the destination buffer.
    2730             :  * @param nDstPixelStride the stride in the buffer pDstData for pixels of
    2731             :  *                      interest, in bytes.
    2732             :  * @param nWordCount the total number of pixel words to copy
    2733             :  *
    2734             :  */
    2735             : template <class Tin, class Tout>
    2736        3168 : inline void GDALCopyWordsComplexOutT(const Tin *const CPL_RESTRICT pSrcData,
    2737             :                                      int nSrcPixelStride,
    2738             :                                      Tout *const CPL_RESTRICT pDstData,
    2739             :                                      int nDstPixelStride, GPtrDiff_t nWordCount)
    2740             : {
    2741        3168 :     decltype(nWordCount) nDstOffset = 0;
    2742             : 
    2743        3168 :     const Tout tOutZero = static_cast<Tout>(0);
    2744             : 
    2745        3168 :     const char *const pSrcDataPtr = reinterpret_cast<const char *>(pSrcData);
    2746        3168 :     char *const pDstDataPtr = reinterpret_cast<char *>(pDstData);
    2747             : 
    2748     1112479 :     for (decltype(nWordCount) n = 0; n < nWordCount; n++)
    2749             :     {
    2750     1109311 :         const Tin tValue =
    2751     1109311 :             *reinterpret_cast<const Tin *>(pSrcDataPtr + n * nSrcPixelStride);
    2752     1109311 :         Tout *const pPixelOut =
    2753     1109311 :             reinterpret_cast<Tout *>(pDstDataPtr + nDstOffset);
    2754     1109311 :         GDALCopyWord(tValue, *pPixelOut);
    2755             : 
    2756     1109311 :         pPixelOut[1] = tOutZero;
    2757             : 
    2758     1109311 :         nDstOffset += nDstPixelStride;
    2759             :     }
    2760        3168 : }
    2761             : 
    2762             : /************************************************************************/
    2763             : /*                           GDALCopyWordsFromT()                       */
    2764             : /************************************************************************/
    2765             : /**
    2766             :  * Template driver function. Given the input type T, call the appropriate
    2767             :  * GDALCopyWordsT function template for the desired output type. You should
    2768             :  * never call this function directly (call GDALCopyWords instead).
    2769             :  *
    2770             :  * @param pSrcData source data buffer
    2771             :  * @param nSrcPixelStride pixel stride in input buffer, in bytes
    2772             :  * @param bInComplex input is complex
    2773             :  * @param pDstData destination data buffer
    2774             :  * @param eDstType destination data type
    2775             :  * @param nDstPixelStride pixel stride in output buffer, in bytes
    2776             :  * @param nWordCount number of pixel words to be copied
    2777             :  */
    2778             : template <class T>
    2779    53542097 : inline void GDALCopyWordsFromT(const T *const CPL_RESTRICT pSrcData,
    2780             :                                int nSrcPixelStride, bool bInComplex,
    2781             :                                void *CPL_RESTRICT pDstData,
    2782             :                                GDALDataType eDstType, int nDstPixelStride,
    2783             :                                GPtrDiff_t nWordCount)
    2784             : {
    2785    53542097 :     switch (eDstType)
    2786             :     {
    2787     4557959 :         case GDT_Byte:
    2788     4557959 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2789             :                            static_cast<unsigned char *>(pDstData),
    2790             :                            nDstPixelStride, nWordCount);
    2791     4558149 :             break;
    2792         458 :         case GDT_Int8:
    2793         458 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2794             :                            static_cast<signed char *>(pDstData),
    2795             :                            nDstPixelStride, nWordCount);
    2796         458 :             break;
    2797      101145 :         case GDT_UInt16:
    2798      101145 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2799             :                            static_cast<unsigned short *>(pDstData),
    2800             :                            nDstPixelStride, nWordCount);
    2801      101140 :             break;
    2802     4126377 :         case GDT_Int16:
    2803     4126377 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2804             :                            static_cast<short *>(pDstData), nDstPixelStride,
    2805             :                            nWordCount);
    2806     4126377 :             break;
    2807        4180 :         case GDT_UInt32:
    2808        4180 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2809             :                            static_cast<unsigned int *>(pDstData),
    2810             :                            nDstPixelStride, nWordCount);
    2811        4180 :             break;
    2812    25496917 :         case GDT_Int32:
    2813    25496917 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2814             :                            static_cast<int *>(pDstData), nDstPixelStride,
    2815             :                            nWordCount);
    2816    25509715 :             break;
    2817         593 :         case GDT_UInt64:
    2818         593 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2819             :                            static_cast<std::uint64_t *>(pDstData),
    2820             :                            nDstPixelStride, nWordCount);
    2821         593 :             break;
    2822        4158 :         case GDT_Int64:
    2823        4158 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2824             :                            static_cast<std::int64_t *>(pDstData),
    2825             :                            nDstPixelStride, nWordCount);
    2826        4158 :             break;
    2827     3869493 :         case GDT_Float32:
    2828     3869493 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2829             :                            static_cast<float *>(pDstData), nDstPixelStride,
    2830             :                            nWordCount);
    2831     3869493 :             break;
    2832    15244911 :         case GDT_Float64:
    2833    15244911 :             GDALCopyWordsT(pSrcData, nSrcPixelStride,
    2834             :                            static_cast<double *>(pDstData), nDstPixelStride,
    2835             :                            nWordCount);
    2836    15244901 :             break;
    2837      122401 :         case GDT_CInt16:
    2838      122401 :             if (bInComplex)
    2839             :             {
    2840      121390 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2841             :                                       static_cast<short *>(pDstData),
    2842             :                                       nDstPixelStride, nWordCount);
    2843             :             }
    2844             :             else  // input is not complex, so we need to promote to a complex
    2845             :                   // buffer
    2846             :             {
    2847        1011 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2848             :                                          static_cast<short *>(pDstData),
    2849             :                                          nDstPixelStride, nWordCount);
    2850             :             }
    2851      122401 :             break;
    2852         800 :         case GDT_CInt32:
    2853         800 :             if (bInComplex)
    2854             :             {
    2855         411 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2856             :                                       static_cast<int *>(pDstData),
    2857             :                                       nDstPixelStride, nWordCount);
    2858             :             }
    2859             :             else  // input is not complex, so we need to promote to a complex
    2860             :                   // buffer
    2861             :             {
    2862         389 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2863             :                                          static_cast<int *>(pDstData),
    2864             :                                          nDstPixelStride, nWordCount);
    2865             :             }
    2866         800 :             break;
    2867        3171 :         case GDT_CFloat32:
    2868        3171 :             if (bInComplex)
    2869             :             {
    2870        2589 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2871             :                                       static_cast<float *>(pDstData),
    2872             :                                       nDstPixelStride, nWordCount);
    2873             :             }
    2874             :             else  // input is not complex, so we need to promote to a complex
    2875             :                   // buffer
    2876             :             {
    2877         582 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2878             :                                          static_cast<float *>(pDstData),
    2879             :                                          nDstPixelStride, nWordCount);
    2880             :             }
    2881        3171 :             break;
    2882        1968 :         case GDT_CFloat64:
    2883        1968 :             if (bInComplex)
    2884             :             {
    2885         782 :                 GDALCopyWordsComplexT(pSrcData, nSrcPixelStride,
    2886             :                                       static_cast<double *>(pDstData),
    2887             :                                       nDstPixelStride, nWordCount);
    2888             :             }
    2889             :             else  // input is not complex, so we need to promote to a complex
    2890             :                   // buffer
    2891             :             {
    2892        1186 :                 GDALCopyWordsComplexOutT(pSrcData, nSrcPixelStride,
    2893             :                                          static_cast<double *>(pDstData),
    2894             :                                          nDstPixelStride, nWordCount);
    2895             :             }
    2896        1968 :             break;
    2897           0 :         case GDT_Unknown:
    2898             :         case GDT_TypeCount:
    2899           0 :             CPLAssert(false);
    2900             :     }
    2901    53555081 : }
    2902             : 
    2903             : }  // end anonymous namespace
    2904             : 
    2905             : /************************************************************************/
    2906             : /*                          GDALReplicateWord()                         */
    2907             : /************************************************************************/
    2908             : 
    2909             : template <class T>
    2910      528127 : inline void GDALReplicateWordT(void *pDstData, int nDstPixelStride,
    2911             :                                GPtrDiff_t nWordCount)
    2912             : {
    2913      528127 :     const T valSet = *static_cast<const T *>(pDstData);
    2914      528127 :     if (nDstPixelStride == static_cast<int>(sizeof(T)))
    2915             :     {
    2916      499722 :         T *pDstPtr = static_cast<T *>(pDstData) + 1;
    2917    20687937 :         while (nWordCount >= 4)
    2918             :         {
    2919    20188224 :             nWordCount -= 4;
    2920    20188224 :             pDstPtr[0] = valSet;
    2921    20188224 :             pDstPtr[1] = valSet;
    2922    20188224 :             pDstPtr[2] = valSet;
    2923    20188224 :             pDstPtr[3] = valSet;
    2924    20188224 :             pDstPtr += 4;
    2925             :         }
    2926     1267522 :         while (nWordCount > 0)
    2927             :         {
    2928      767800 :             --nWordCount;
    2929      767800 :             *pDstPtr = valSet;
    2930      767800 :             pDstPtr++;
    2931             :         }
    2932             :     }
    2933             :     else
    2934             :     {
    2935       28407 :         GByte *pabyDstPtr = static_cast<GByte *>(pDstData) + nDstPixelStride;
    2936      954322 :         while (nWordCount > 0)
    2937             :         {
    2938      925915 :             --nWordCount;
    2939      925915 :             *reinterpret_cast<T *>(pabyDstPtr) = valSet;
    2940      925915 :             pabyDstPtr += nDstPixelStride;
    2941             :         }
    2942             :     }
    2943      528127 : }
    2944             : 
    2945      906316 : static void GDALReplicateWord(const void *CPL_RESTRICT pSrcData,
    2946             :                               GDALDataType eSrcType,
    2947             :                               void *CPL_RESTRICT pDstData,
    2948             :                               GDALDataType eDstType, int nDstPixelStride,
    2949             :                               GPtrDiff_t nWordCount)
    2950             : {
    2951             :     /* -----------------------------------------------------------------------
    2952             :      */
    2953             :     /* Special case when the source data is always the same value */
    2954             :     /* (for VRTSourcedRasterBand::IRasterIO and
    2955             :      * VRTDerivedRasterBand::IRasterIO*/
    2956             :     /*  for example) */
    2957             :     /* -----------------------------------------------------------------------
    2958             :      */
    2959             :     // Let the general translation case do the necessary conversions
    2960             :     // on the first destination element.
    2961      906316 :     GDALCopyWords64(pSrcData, eSrcType, 0, pDstData, eDstType, 0, 1);
    2962             : 
    2963             :     // Now copy the first element to the nWordCount - 1 following destination
    2964             :     // elements.
    2965      906206 :     nWordCount--;
    2966      906206 :     GByte *pabyDstWord = reinterpret_cast<GByte *>(pDstData) + nDstPixelStride;
    2967             : 
    2968      906206 :     switch (eDstType)
    2969             :     {
    2970      377978 :         case GDT_Byte:
    2971             :         case GDT_Int8:
    2972             :         {
    2973      377978 :             if (nDstPixelStride == 1)
    2974             :             {
    2975      344226 :                 if (nWordCount > 0)
    2976      344226 :                     memset(pabyDstWord,
    2977      344226 :                            *reinterpret_cast<const GByte *>(pDstData),
    2978             :                            nWordCount);
    2979             :             }
    2980             :             else
    2981             :             {
    2982       33752 :                 GByte valSet = *reinterpret_cast<const GByte *>(pDstData);
    2983     5438530 :                 while (nWordCount > 0)
    2984             :                 {
    2985     5404780 :                     --nWordCount;
    2986     5404780 :                     *pabyDstWord = valSet;
    2987     5404780 :                     pabyDstWord += nDstPixelStride;
    2988             :                 }
    2989             :             }
    2990      377978 :             break;
    2991             :         }
    2992             : 
    2993             : #define CASE_DUPLICATE_SIMPLE(enum_type, c_type)                               \
    2994             :     case enum_type:                                                            \
    2995             :     {                                                                          \
    2996             :         GDALReplicateWordT<c_type>(pDstData, nDstPixelStride, nWordCount);     \
    2997             :         break;                                                                 \
    2998             :     }
    2999             : 
    3000         354 :             CASE_DUPLICATE_SIMPLE(GDT_UInt16, GUInt16)
    3001      169653 :             CASE_DUPLICATE_SIMPLE(GDT_Int16, GInt16)
    3002          56 :             CASE_DUPLICATE_SIMPLE(GDT_UInt32, GUInt32)
    3003      300130 :             CASE_DUPLICATE_SIMPLE(GDT_Int32, GInt32)
    3004          21 :             CASE_DUPLICATE_SIMPLE(GDT_UInt64, std::uint64_t)
    3005         662 :             CASE_DUPLICATE_SIMPLE(GDT_Int64, std::int64_t)
    3006       52216 :             CASE_DUPLICATE_SIMPLE(GDT_Float32, float)
    3007        5049 :             CASE_DUPLICATE_SIMPLE(GDT_Float64, double)
    3008             : 
    3009             : #define CASE_DUPLICATE_COMPLEX(enum_type, c_type)                              \
    3010             :     case enum_type:                                                            \
    3011             :     {                                                                          \
    3012             :         c_type valSet1 = reinterpret_cast<const c_type *>(pDstData)[0];        \
    3013             :         c_type valSet2 = reinterpret_cast<const c_type *>(pDstData)[1];        \
    3014             :         while (nWordCount > 0)                                                 \
    3015             :         {                                                                      \
    3016             :             --nWordCount;                                                      \
    3017             :             reinterpret_cast<c_type *>(pabyDstWord)[0] = valSet1;              \
    3018             :             reinterpret_cast<c_type *>(pabyDstWord)[1] = valSet2;              \
    3019             :             pabyDstWord += nDstPixelStride;                                    \
    3020             :         }                                                                      \
    3021             :         break;                                                                 \
    3022             :     }
    3023             : 
    3024         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt16, GInt16)
    3025         784 :             CASE_DUPLICATE_COMPLEX(GDT_CInt32, GInt32)
    3026         784 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat32, float)
    3027         784 :             CASE_DUPLICATE_COMPLEX(GDT_CFloat64, double)
    3028             : 
    3029           0 :         case GDT_Unknown:
    3030             :         case GDT_TypeCount:
    3031           0 :             CPLAssert(false);
    3032             :     }
    3033      906300 : }
    3034             : 
    3035             : /************************************************************************/
    3036             : /*                        GDALUnrolledCopy()                            */
    3037             : /************************************************************************/
    3038             : 
    3039             : template <class T, int srcStride, int dstStride>
    3040     5329411 : static inline void GDALUnrolledCopyGeneric(T *CPL_RESTRICT pDest,
    3041             :                                            const T *CPL_RESTRICT pSrc,
    3042             :                                            GPtrDiff_t nIters)
    3043             : {
    3044     5329411 :     if (nIters >= 16)
    3045             :     {
    3046   138285268 :         for (GPtrDiff_t i = nIters / 16; i != 0; i--)
    3047             :         {
    3048   133085684 :             pDest[0 * dstStride] = pSrc[0 * srcStride];
    3049   133085684 :             pDest[1 * dstStride] = pSrc[1 * srcStride];
    3050   133085684 :             pDest[2 * dstStride] = pSrc[2 * srcStride];
    3051   133085684 :             pDest[3 * dstStride] = pSrc[3 * srcStride];
    3052   133085684 :             pDest[4 * dstStride] = pSrc[4 * srcStride];
    3053   133085684 :             pDest[5 * dstStride] = pSrc[5 * srcStride];
    3054   133085684 :             pDest[6 * dstStride] = pSrc[6 * srcStride];
    3055   133085684 :             pDest[7 * dstStride] = pSrc[7 * srcStride];
    3056   133085684 :             pDest[8 * dstStride] = pSrc[8 * srcStride];
    3057   133085684 :             pDest[9 * dstStride] = pSrc[9 * srcStride];
    3058   133085684 :             pDest[10 * dstStride] = pSrc[10 * srcStride];
    3059   133085684 :             pDest[11 * dstStride] = pSrc[11 * srcStride];
    3060   133085684 :             pDest[12 * dstStride] = pSrc[12 * srcStride];
    3061   133085684 :             pDest[13 * dstStride] = pSrc[13 * srcStride];
    3062   133085684 :             pDest[14 * dstStride] = pSrc[14 * srcStride];
    3063   133085684 :             pDest[15 * dstStride] = pSrc[15 * srcStride];
    3064   133085684 :             pDest += 16 * dstStride;
    3065   133085684 :             pSrc += 16 * srcStride;
    3066             :         }
    3067     5199632 :         nIters = nIters % 16;
    3068             :     }
    3069     7591706 :     for (GPtrDiff_t i = 0; i < nIters; i++)
    3070             :     {
    3071     2262300 :         pDest[i * dstStride] = *pSrc;
    3072     2262300 :         pSrc += srcStride;
    3073             :     }
    3074     5329411 : }
    3075             : 
    3076             : template <class T, int srcStride, int dstStride>
    3077     5324211 : static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
    3078             :                                     const T *CPL_RESTRICT pSrc,
    3079             :                                     GPtrDiff_t nIters)
    3080             : {
    3081     5324211 :     GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
    3082     5324231 : }
    3083             : 
    3084             : #ifdef HAVE_SSE2
    3085             : 
    3086             : template <>
    3087      303985 : void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
    3088             :                                    const GByte *CPL_RESTRICT pSrc,
    3089             :                                    GPtrDiff_t nIters)
    3090             : {
    3091      303985 :     decltype(nIters) i = 0;
    3092      303985 :     if (nIters > 16)
    3093             :     {
    3094      145815 :         const __m128i xmm_mask = _mm_set1_epi16(0xff);
    3095             :         // If we were sure that there would always be 1 trailing byte, we could
    3096             :         // check against nIters - 15
    3097     2544120 :         for (; i < nIters - 16; i += 16)
    3098             :         {
    3099             :             __m128i xmm0 =
    3100     2398300 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3101             :             __m128i xmm1 =
    3102     4796610 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3103             :             // Set higher 8bit of each int16 packed word to 0
    3104     2398300 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3105     2398300 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3106             :             // Pack int16 to uint8 and merge back both vector
    3107     2398300 :             xmm0 = _mm_packus_epi16(xmm0, xmm1);
    3108             : 
    3109             :             // Store result
    3110     2398300 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3111             : 
    3112     2398300 :             pSrc += 2 * 16;
    3113             :         }
    3114             :     }
    3115     3875160 :     for (; i < nIters; i++)
    3116             :     {
    3117     3571180 :         pDest[i] = *pSrc;
    3118     3571180 :         pSrc += 2;
    3119             :     }
    3120      303985 : }
    3121             : 
    3122             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    3123             : 
    3124             : template <>
    3125      184630 : void GDALUnrolledCopy<GByte, 3, 1>(GByte *CPL_RESTRICT pDest,
    3126             :                                    const GByte *CPL_RESTRICT pSrc,
    3127             :                                    GPtrDiff_t nIters)
    3128             : {
    3129      184630 :     if (nIters > 16 && CPLHaveRuntimeSSSE3())
    3130             :     {
    3131      179430 :         GDALUnrolledCopy_GByte_3_1_SSSE3(pDest, pSrc, nIters);
    3132             :     }
    3133             :     else
    3134             :     {
    3135        5200 :         GDALUnrolledCopyGeneric<GByte, 3, 1>(pDest, pSrc, nIters);
    3136             :     }
    3137      184630 : }
    3138             : 
    3139             : #endif
    3140             : 
    3141             : template <>
    3142      105200 : void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
    3143             :                                    const GByte *CPL_RESTRICT pSrc,
    3144             :                                    GPtrDiff_t nIters)
    3145             : {
    3146      105200 :     decltype(nIters) i = 0;
    3147      105200 :     if (nIters > 16)
    3148             :     {
    3149       99907 :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    3150             :         // If we were sure that there would always be 3 trailing bytes, we could
    3151             :         // check against nIters - 15
    3152     8826220 :         for (; i < nIters - 16; i += 16)
    3153             :         {
    3154             :             __m128i xmm0 =
    3155     8725980 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
    3156             :             __m128i xmm1 =
    3157     8725980 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
    3158             :             __m128i xmm2 =
    3159     8725980 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
    3160             :             __m128i xmm3 =
    3161    17452000 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 48));
    3162             :             // Set higher 24bit of each int32 packed word to 0
    3163     8725980 :             xmm0 = _mm_and_si128(xmm0, xmm_mask);
    3164     8725980 :             xmm1 = _mm_and_si128(xmm1, xmm_mask);
    3165     8725980 :             xmm2 = _mm_and_si128(xmm2, xmm_mask);
    3166     8725980 :             xmm3 = _mm_and_si128(xmm3, xmm_mask);
    3167             :             // Pack int32 to int16
    3168     8726510 :             xmm0 = _mm_packs_epi32(xmm0, xmm1);
    3169     8726450 :             xmm2 = _mm_packs_epi32(xmm2, xmm3);
    3170             :             // Pack int16 to uint8
    3171     8726310 :             xmm0 = _mm_packus_epi16(xmm0, xmm2);
    3172             : 
    3173             :             // Store result
    3174     8726310 :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
    3175             : 
    3176     8726310 :             pSrc += 4 * 16;
    3177             :         }
    3178             :     }
    3179     1118790 :     for (; i < nIters; i++)
    3180             :     {
    3181     1013250 :         pDest[i] = *pSrc;
    3182     1013250 :         pSrc += 4;
    3183             :     }
    3184      105535 : }
    3185             : #endif  // HAVE_SSE2
    3186             : 
    3187             : /************************************************************************/
    3188             : /*                         GDALFastCopy()                               */
    3189             : /************************************************************************/
    3190             : 
    3191             : template <class T>
    3192    39811900 : static inline void GDALFastCopy(T *CPL_RESTRICT pDest, int nDestStride,
    3193             :                                 const T *CPL_RESTRICT pSrc, int nSrcStride,
    3194             :                                 GPtrDiff_t nIters)
    3195             : {
    3196    39811900 :     constexpr int sizeofT = static_cast<int>(sizeof(T));
    3197    39811900 :     if (nIters == 1)
    3198             :     {
    3199    22302140 :         *pDest = *pSrc;
    3200             :     }
    3201    17509816 :     else if (nDestStride == sizeofT)
    3202             :     {
    3203    12248223 :         if (nSrcStride == sizeofT)
    3204             :         {
    3205    11512790 :             memcpy(pDest, pSrc, nIters * sizeof(T));
    3206             :         }
    3207      735430 :         else if (nSrcStride == 2 * sizeofT)
    3208             :         {
    3209      306938 :             GDALUnrolledCopy<T, 2, 1>(pDest, pSrc, nIters);
    3210             :         }
    3211      428492 :         else if (nSrcStride == 3 * sizeofT)
    3212             :         {
    3213      289938 :             GDALUnrolledCopy<T, 3, 1>(pDest, pSrc, nIters);
    3214             :         }
    3215      138554 :         else if (nSrcStride == 4 * sizeofT)
    3216             :         {
    3217      134068 :             GDALUnrolledCopy<T, 4, 1>(pDest, pSrc, nIters);
    3218             :         }
    3219             :         else
    3220             :         {
    3221    12978040 :             while (nIters-- > 0)
    3222             :             {
    3223    12973530 :                 *pDest = *pSrc;
    3224    12973530 :                 pSrc += nSrcStride / sizeofT;
    3225    12973530 :                 pDest++;
    3226             :             }
    3227             :         }
    3228             :     }
    3229     5261553 :     else if (nSrcStride == sizeofT)
    3230             :     {
    3231     5246505 :         if (nDestStride == 2 * sizeofT)
    3232             :         {
    3233      129155 :             GDALUnrolledCopy<T, 1, 2>(pDest, pSrc, nIters);
    3234             :         }
    3235     5117350 :         else if (nDestStride == 3 * sizeofT)
    3236             :         {
    3237     4410211 :             GDALUnrolledCopy<T, 1, 3>(pDest, pSrc, nIters);
    3238             :         }
    3239      707143 :         else if (nDestStride == 4 * sizeofT)
    3240             :         {
    3241      647716 :             GDALUnrolledCopy<T, 1, 4>(pDest, pSrc, nIters);
    3242             :         }
    3243             :         else
    3244             :         {
    3245    12650500 :             while (nIters-- > 0)
    3246             :             {
    3247    12591080 :                 *pDest = *pSrc;
    3248    12591080 :                 pSrc++;
    3249    12591080 :                 pDest += nDestStride / sizeofT;
    3250             :             }
    3251             :         }
    3252             :     }
    3253             :     else
    3254             :     {
    3255     1113938 :         while (nIters-- > 0)
    3256             :         {
    3257     1098888 :             *pDest = *pSrc;
    3258     1098888 :             pSrc += nSrcStride / sizeofT;
    3259     1098888 :             pDest += nDestStride / sizeofT;
    3260             :         }
    3261             :     }
    3262    39811900 : }
    3263             : 
    3264             : /************************************************************************/
    3265             : /*                         GDALFastCopyByte()                           */
    3266             : /************************************************************************/
    3267             : 
    3268      276287 : static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
    3269             :                              int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
    3270             :                              int nDstPixelStride, GPtrDiff_t nWordCount)
    3271             : {
    3272      276287 :     GDALFastCopy(pDstData, nDstPixelStride, pSrcData, nSrcPixelStride,
    3273             :                  nWordCount);
    3274      276287 : }
    3275             : 
    3276             : /************************************************************************/
    3277             : /*                           GDALCopyWords()                            */
    3278             : /************************************************************************/
    3279             : 
    3280             : /**
    3281             :  * Copy pixel words from buffer to buffer.
    3282             :  *
    3283             :  * @see GDALCopyWords64()
    3284             :  */
    3285    87098000 : void CPL_STDCALL GDALCopyWords(const void *CPL_RESTRICT pSrcData,
    3286             :                                GDALDataType eSrcType, int nSrcPixelStride,
    3287             :                                void *CPL_RESTRICT pDstData,
    3288             :                                GDALDataType eDstType, int nDstPixelStride,
    3289             :                                int nWordCount)
    3290             : {
    3291    87098000 :     GDALCopyWords64(pSrcData, eSrcType, nSrcPixelStride, pDstData, eDstType,
    3292             :                     nDstPixelStride, nWordCount);
    3293    87095400 : }
    3294             : 
    3295             : /************************************************************************/
    3296             : /*                          GDALCopyWords64()                           */
    3297             : /************************************************************************/
    3298             : 
    3299             : /**
    3300             :  * Copy pixel words from buffer to buffer.
    3301             :  *
    3302             :  * This function is used to copy pixel word values from one memory buffer
    3303             :  * to another, with support for conversion between data types, and differing
    3304             :  * step factors. The data type conversion is done using the following
    3305             :  * rules:
    3306             :  * <ul>
    3307             :  * <li>Values assigned to a lower range integer type are clipped. For
    3308             :  * instance assigning GDT_Int16 values to a GDT_Byte buffer will cause values
    3309             :  * less the 0 to be set to 0, and values larger than 255 to be set to 255.
    3310             :  * </li>
    3311             :  * <li>
    3312             :  * Assignment from floating point to integer rounds to closest integer.
    3313             :  * +Infinity is mapped to the largest integer. -Infinity is mapped to the
    3314             :  * smallest integer. NaN is mapped to 0.
    3315             :  * </li>
    3316             :  * <li>
    3317             :  * Assignment from non-complex to complex will result in the imaginary part
    3318             :  * being set to zero on output.
    3319             :  * </li>
    3320             :  * <li> Assignment from complex to
    3321             :  * non-complex will result in the complex portion being lost and the real
    3322             :  * component being preserved (<i>not magnitude!</i>).
    3323             :  * </li>
    3324             :  * </ul>
    3325             :  *
    3326             :  * No assumptions are made about the source or destination words occurring
    3327             :  * on word boundaries.  It is assumed that all values are in native machine
    3328             :  * byte order.
    3329             :  *
    3330             :  * @param pSrcData Pointer to source data to be converted.
    3331             :  * @param eSrcType the source data type (see GDALDataType enum)
    3332             :  * @param nSrcPixelStride Source pixel stride (i.e. distance between 2 words),
    3333             :  * in bytes
    3334             :  * @param pDstData Pointer to buffer where destination data should go
    3335             :  * @param eDstType the destination data type (see GDALDataType enum)
    3336             :  * @param nDstPixelStride Destination pixel stride (i.e. distance between 2
    3337             :  * words), in bytes
    3338             :  * @param nWordCount number of words to be copied
    3339             :  *
    3340             :  * @note
    3341             :  * When adding a new data type to GDAL, you must do the following to
    3342             :  * support it properly within the GDALCopyWords function:
    3343             :  * 1. Add the data type to the switch on eSrcType in GDALCopyWords.
    3344             :  *    This should invoke the appropriate GDALCopyWordsFromT wrapper.
    3345             :  * 2. Add the data type to the switch on eDstType in GDALCopyWordsFromT.
    3346             :  *    This should call the appropriate GDALCopyWordsT template.
    3347             :  * 3. If appropriate, overload the appropriate CopyWord template in the
    3348             :  *    above namespace. This will ensure that any conversion issues are
    3349             :  *    handled (cases like the float -> int32 case, where the min/max)
    3350             :  *    values are subject to roundoff error.
    3351             :  */
    3352             : 
    3353   108301000 : void CPL_STDCALL GDALCopyWords64(const void *CPL_RESTRICT pSrcData,
    3354             :                                  GDALDataType eSrcType, int nSrcPixelStride,
    3355             :                                  void *CPL_RESTRICT pDstData,
    3356             :                                  GDALDataType eDstType, int nDstPixelStride,
    3357             :                                  GPtrDiff_t nWordCount)
    3358             : 
    3359             : {
    3360             :     // On platforms where alignment matters, be careful
    3361   108301000 :     const int nSrcDataTypeSize = GDALGetDataTypeSizeBytes(eSrcType);
    3362   108285000 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(eDstType);
    3363   108286000 :     if (CPL_UNLIKELY(nSrcDataTypeSize == 0 || nDstDataTypeSize == 0))
    3364             :     {
    3365           2 :         CPLError(CE_Failure, CPLE_NotSupported,
    3366             :                  "GDALCopyWords64(): unsupported GDT_Unknown/GDT_TypeCount "
    3367             :                  "argument");
    3368           2 :         return;
    3369             :     }
    3370   108286000 :     if (!(eSrcType == eDstType && nSrcPixelStride == nDstPixelStride) &&
    3371    60236000 :         ((reinterpret_cast<uintptr_t>(pSrcData) % nSrcDataTypeSize) != 0 ||
    3372    60234200 :          (reinterpret_cast<uintptr_t>(pDstData) % nDstDataTypeSize) != 0 ||
    3373    60226200 :          (nSrcPixelStride % nSrcDataTypeSize) != 0 ||
    3374    60222900 :          (nDstPixelStride % nDstDataTypeSize) != 0))
    3375             :     {
    3376         905 :         if (eSrcType == eDstType)
    3377             :         {
    3378       34800 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    3379             :             {
    3380       34000 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    3381             :                        static_cast<const GByte *>(pSrcData) +
    3382       34000 :                            nSrcPixelStride * i,
    3383             :                        nDstDataTypeSize);
    3384             :             }
    3385             :         }
    3386             :         else
    3387             :         {
    3388         210 :             const auto getAlignedPtr = [](GByte *ptr, int align)
    3389             :             {
    3390             :                 return ptr +
    3391         210 :                        ((align - (reinterpret_cast<uintptr_t>(ptr) % align)) %
    3392         210 :                         align);
    3393             :             };
    3394             : 
    3395             :             // The largest we need is for CFloat64 (16 bytes), so 32 bytes to
    3396             :             // be sure to get correctly aligned pointer.
    3397         105 :             constexpr size_t SIZEOF_CFLOAT64 = 2 * sizeof(double);
    3398             :             GByte abySrcBuffer[2 * SIZEOF_CFLOAT64];
    3399             :             GByte abyDstBuffer[2 * SIZEOF_CFLOAT64];
    3400             :             GByte *pabySrcBuffer =
    3401         105 :                 getAlignedPtr(abySrcBuffer, nSrcDataTypeSize);
    3402             :             GByte *pabyDstBuffer =
    3403         105 :                 getAlignedPtr(abyDstBuffer, nDstDataTypeSize);
    3404        3360 :             for (decltype(nWordCount) i = 0; i < nWordCount; i++)
    3405             :             {
    3406        3255 :                 memcpy(pabySrcBuffer,
    3407             :                        static_cast<const GByte *>(pSrcData) +
    3408        3255 :                            nSrcPixelStride * i,
    3409             :                        nSrcDataTypeSize);
    3410        3255 :                 GDALCopyWords64(pabySrcBuffer, eSrcType, 0, pabyDstBuffer,
    3411             :                                 eDstType, 0, 1);
    3412        3255 :                 memcpy(static_cast<GByte *>(pDstData) + nDstPixelStride * i,
    3413             :                        pabyDstBuffer, nDstDataTypeSize);
    3414             :             }
    3415             :         }
    3416         905 :         return;
    3417             :     }
    3418             : 
    3419             :     // Deal with the case where we're replicating a single word into the
    3420             :     // provided buffer
    3421   108285000 :     if (nSrcPixelStride == 0 && nWordCount > 1)
    3422             :     {
    3423      906311 :         GDALReplicateWord(pSrcData, eSrcType, pDstData, eDstType,
    3424             :                           nDstPixelStride, nWordCount);
    3425      906328 :         return;
    3426             :     }
    3427             : 
    3428   107379000 :     if (eSrcType == eDstType)
    3429             :     {
    3430    54005300 :         if (eSrcType == GDT_Byte || eSrcType == GDT_Int8)
    3431             :         {
    3432    18570200 :             GDALFastCopy(static_cast<GByte *>(pDstData), nDstPixelStride,
    3433             :                          static_cast<const GByte *>(pSrcData), nSrcPixelStride,
    3434             :                          nWordCount);
    3435    18569600 :             return;
    3436             :         }
    3437             : 
    3438    35435100 :         if (nSrcDataTypeSize == 2 && (nSrcPixelStride % 2) == 0 &&
    3439    20965500 :             (nDstPixelStride % 2) == 0)
    3440             :         {
    3441    20965500 :             GDALFastCopy(static_cast<short *>(pDstData), nDstPixelStride,
    3442             :                          static_cast<const short *>(pSrcData), nSrcPixelStride,
    3443             :                          nWordCount);
    3444    20965100 :             return;
    3445             :         }
    3446             : 
    3447    14469600 :         if (nWordCount == 1)
    3448             :         {
    3449             : #if defined(CSA_BUILD) || defined(__COVERITY__)
    3450             :             // Avoid false positives...
    3451             :             memcpy(pDstData, pSrcData, nSrcDataTypeSize);
    3452             : #else
    3453    14056600 :             if (nSrcDataTypeSize == 2)
    3454           0 :                 memcpy(pDstData, pSrcData, 2);
    3455    14056600 :             else if (nSrcDataTypeSize == 4)
    3456    14014000 :                 memcpy(pDstData, pSrcData, 4);
    3457       42617 :             else if (nSrcDataTypeSize == 8)
    3458       26100 :                 memcpy(pDstData, pSrcData, 8);
    3459             :             else /* if( eSrcType == GDT_CFloat64 ) */
    3460       16517 :                 memcpy(pDstData, pSrcData, 16);
    3461             : #endif
    3462    14056600 :             return;
    3463             :         }
    3464             : 
    3465             :         // Let memcpy() handle the case where we're copying a packed buffer
    3466             :         // of pixels.
    3467      412965 :         if (nSrcPixelStride == nDstPixelStride)
    3468             :         {
    3469      259264 :             if (nSrcPixelStride == nSrcDataTypeSize)
    3470             :             {
    3471      257094 :                 memcpy(pDstData, pSrcData, nWordCount * nSrcDataTypeSize);
    3472      257094 :                 return;
    3473             :             }
    3474             :         }
    3475             :     }
    3476             : 
    3477             :     // Handle the more general case -- deals with conversion of data types
    3478             :     // directly.
    3479    53529500 :     switch (eSrcType)
    3480             :     {
    3481    14923700 :         case GDT_Byte:
    3482    14923700 :             GDALCopyWordsFromT<unsigned char>(
    3483             :                 static_cast<const unsigned char *>(pSrcData), nSrcPixelStride,
    3484             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3485    14941200 :             break;
    3486         976 :         case GDT_Int8:
    3487         976 :             GDALCopyWordsFromT<signed char>(
    3488             :                 static_cast<const signed char *>(pSrcData), nSrcPixelStride,
    3489             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3490         976 :             break;
    3491       53125 :         case GDT_UInt16:
    3492       53125 :             GDALCopyWordsFromT<unsigned short>(
    3493             :                 static_cast<const unsigned short *>(pSrcData), nSrcPixelStride,
    3494             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3495       53125 :             break;
    3496     4543330 :         case GDT_Int16:
    3497     4543330 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    3498             :                                       nSrcPixelStride, false, pDstData,
    3499             :                                       eDstType, nDstPixelStride, nWordCount);
    3500     4543320 :             break;
    3501        6747 :         case GDT_UInt32:
    3502        6747 :             GDALCopyWordsFromT<unsigned int>(
    3503             :                 static_cast<const unsigned int *>(pSrcData), nSrcPixelStride,
    3504             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3505        6747 :             break;
    3506    12254600 :         case GDT_Int32:
    3507    12254600 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    3508             :                                     nSrcPixelStride, false, pDstData, eDstType,
    3509             :                                     nDstPixelStride, nWordCount);
    3510    12254600 :             break;
    3511        1430 :         case GDT_UInt64:
    3512        1430 :             GDALCopyWordsFromT<std::uint64_t>(
    3513             :                 static_cast<const std::uint64_t *>(pSrcData), nSrcPixelStride,
    3514             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3515        1430 :             break;
    3516        7280 :         case GDT_Int64:
    3517        7280 :             GDALCopyWordsFromT<std::int64_t>(
    3518             :                 static_cast<const std::int64_t *>(pSrcData), nSrcPixelStride,
    3519             :                 false, pDstData, eDstType, nDstPixelStride, nWordCount);
    3520        7280 :             break;
    3521      318785 :         case GDT_Float32:
    3522      318785 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    3523             :                                       nSrcPixelStride, false, pDstData,
    3524             :                                       eDstType, nDstPixelStride, nWordCount);
    3525      318779 :             break;
    3526    20678100 :         case GDT_Float64:
    3527    20678100 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    3528             :                                        nSrcPixelStride, false, pDstData,
    3529             :                                        eDstType, nDstPixelStride, nWordCount);
    3530    20678200 :             break;
    3531      566961 :         case GDT_CInt16:
    3532      566961 :             GDALCopyWordsFromT<short>(static_cast<const short *>(pSrcData),
    3533             :                                       nSrcPixelStride, true, pDstData, eDstType,
    3534             :                                       nDstPixelStride, nWordCount);
    3535      566961 :             break;
    3536         397 :         case GDT_CInt32:
    3537         397 :             GDALCopyWordsFromT<int>(static_cast<const int *>(pSrcData),
    3538             :                                     nSrcPixelStride, true, pDstData, eDstType,
    3539             :                                     nDstPixelStride, nWordCount);
    3540         397 :             break;
    3541        1357 :         case GDT_CFloat32:
    3542        1357 :             GDALCopyWordsFromT<float>(static_cast<const float *>(pSrcData),
    3543             :                                       nSrcPixelStride, true, pDstData, eDstType,
    3544             :                                       nDstPixelStride, nWordCount);
    3545        1357 :             break;
    3546      172534 :         case GDT_CFloat64:
    3547      172534 :             GDALCopyWordsFromT<double>(static_cast<const double *>(pSrcData),
    3548             :                                        nSrcPixelStride, true, pDstData,
    3549             :                                        eDstType, nDstPixelStride, nWordCount);
    3550      172534 :             break;
    3551           0 :         case GDT_Unknown:
    3552             :         case GDT_TypeCount:
    3553           0 :             CPLAssert(false);
    3554             :     }
    3555             : }
    3556             : 
    3557             : /************************************************************************/
    3558             : /*                            GDALCopyBits()                            */
    3559             : /************************************************************************/
    3560             : 
    3561             : /**
    3562             :  * Bitwise word copying.
    3563             :  *
    3564             :  * A function for moving sets of partial bytes around.  Loosely
    3565             :  * speaking this is a bitwise analog to GDALCopyWords().
    3566             :  *
    3567             :  * It copies nStepCount "words" where each word is nBitCount bits long.
    3568             :  * The nSrcStep and nDstStep are the number of bits from the start of one
    3569             :  * word to the next (same as nBitCount if they are packed).  The nSrcOffset
    3570             :  * and nDstOffset are the offsets into the source and destination buffers
    3571             :  * to start at, also measured in bits.
    3572             :  *
    3573             :  * All bit offsets are assumed to start from the high order bit in a byte
    3574             :  * (i.e. most significant bit first).  Currently this function is not very
    3575             :  * optimized, but it may be improved for some common cases in the future
    3576             :  * as needed.
    3577             :  *
    3578             :  * @param pabySrcData the source data buffer.
    3579             :  * @param nSrcOffset the offset (in bits) in pabySrcData to the start of the
    3580             :  * first word to copy.
    3581             :  * @param nSrcStep the offset in bits from the start of one source word to
    3582             :  * the start of the next.
    3583             :  * @param pabyDstData the destination data buffer.
    3584             :  * @param nDstOffset the offset (in bits) in pabyDstData to the start of the
    3585             :  * first word to copy over.
    3586             :  * @param nDstStep the offset in bits from the start of one destination word
    3587             :  * to the start of the next.
    3588             :  * @param nBitCount the number of bits in a word to be copied.
    3589             :  * @param nStepCount the number of words to copy.
    3590             :  */
    3591             : 
    3592           0 : void GDALCopyBits(const GByte *pabySrcData, int nSrcOffset, int nSrcStep,
    3593             :                   GByte *pabyDstData, int nDstOffset, int nDstStep,
    3594             :                   int nBitCount, int nStepCount)
    3595             : 
    3596             : {
    3597           0 :     VALIDATE_POINTER0(pabySrcData, "GDALCopyBits");  // only the source pointer is checked; pabyDstData is used as-is
    3598             : 
    3599           0 :     for (int iStep = 0; iStep < nStepCount; iStep++)
    3600             :     {
    3601           0 :         for (int iBit = 0; iBit < nBitCount; iBit++)
    3602             :         {
    3603           0 :             if (pabySrcData[nSrcOffset >> 3] & (0x80 >> (nSrcOffset & 7)))  // byte = offset / 8, mask selects the bit MSB-first
    3604           0 :                 pabyDstData[nDstOffset >> 3] |= (0x80 >> (nDstOffset & 7));  // set only the target bit
    3605             :             else
    3606           0 :                 pabyDstData[nDstOffset >> 3] &= ~(0x80 >> (nDstOffset & 7));  // clear only the target bit
    3607             : 
    3608           0 :             nSrcOffset++;
    3609           0 :             nDstOffset++;
    3610             :         }
    3611             : 
    3612           0 :         nSrcOffset += (nSrcStep - nBitCount);  // nBitCount bits were already consumed; advance the rest of the step
    3613           0 :         nDstOffset += (nDstStep - nBitCount);
    3614             :     }
    3615             : }
    3616             : 
    3617             : /************************************************************************/
    3618             : /*                    GDALBandGetBestOverviewLevel()                    */
    3619             : /*                                                                      */
    3620             : /* Returns the best overview level to satisfy the query or -1 if none   */
    3621             : /* Also updates nXOff, nYOff, nXSize, nYSize and psExtraArg when        */
    3622             : /* returning a valid overview level                                     */
    3623             : /************************************************************************/
    3624             : 
    3625           0 : int GDALBandGetBestOverviewLevel(GDALRasterBand *poBand, int &nXOff, int &nYOff,
    3626             :                                  int &nXSize, int &nYSize, int nBufXSize,
    3627             :                                  int nBufYSize)
    3628             : {
    3629           0 :     return GDALBandGetBestOverviewLevel2(poBand, nXOff, nYOff, nXSize, nYSize,  // thin wrapper around the ...2() variant
    3630           0 :                                          nBufXSize, nBufYSize, nullptr);  // nullptr: no extra-arg to consult or update
    3631             : }
    3632             : 
    3633      322828 : int GDALBandGetBestOverviewLevel2(GDALRasterBand *poBand, int &nXOff,
    3634             :                                   int &nYOff, int &nXSize, int &nYSize,
    3635             :                                   int nBufXSize, int nBufYSize,
    3636             :                                   GDALRasterIOExtraArg *psExtraArg)
    3637             : {
    3638             :     /* -------------------------------------------------------------------- */
    3639             :     /*      Compute the desired downsampling factor.  It is                 */
    3640             :     /*      based on the least reduced axis, and represents the number      */
    3641             :     /*      of source pixels to one destination pixel.                      */
    3642             :     /* -------------------------------------------------------------------- */
    3643      322828 :     const double dfDesiredDownsamplingFactor =
    3644      322828 :         ((nXSize / static_cast<double>(nBufXSize)) <
    3645      160491 :              (nYSize / static_cast<double>(nBufYSize)) ||
    3646             :          nBufYSize == 1)  // a one-line buffer always uses the X ratio
    3647      354204 :             ? nXSize / static_cast<double>(nBufXSize)
    3648      129115 :             : nYSize / static_cast<double>(nBufYSize);
    3649             : 
    3650             :     /* -------------------------------------------------------------------- */
    3651             :     /*      Find the overview level with the largest downsampling factor    */
    3652             :     /*      (most downsampled) that is still less (or only a little more)   */
    3653             :     /*      downsampled than the request.                                   */
    3654             :     /* -------------------------------------------------------------------- */
    3655      322828 :     const int nOverviewCount = poBand->GetOverviewCount();
    3656      322828 :     GDALRasterBand *poBestOverview = nullptr;
    3657      322828 :     double dfBestDownsamplingFactor = 0;
    3658      322828 :     int nBestOverviewLevel = -1;
    3659             : 
    3660             :     const char *pszOversampligThreshold =
    3661      322828 :         CPLGetConfigOption("GDAL_OVERVIEW_OVERSAMPLING_THRESHOLD", nullptr);
    3662             : 
    3663             :     // Note: keep this logic for overview selection in sync between
    3664             :     // gdalwarp_lib.cpp and rasterio.cpp
    3665             :     // Cf https://github.com/OSGeo/gdal/pull/9040#issuecomment-1898524693
    3666             :     const double dfOversamplingThreshold =
    3667      645647 :         pszOversampligThreshold ? CPLAtof(pszOversampligThreshold)  // an explicit config option always wins
    3668      322819 :         : psExtraArg && psExtraArg->eResampleAlg != GRIORA_NearestNeighbour
    3669      645638 :             ? 1.0  // non-nearest resampling: no tolerance beyond the requested factor
    3670      322828 :             : 1.2;  // nearest neighbour or no extra-arg: accept up to 1.2x the requested factor
    3671      325519 :     for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    3672             :     {
    3673        5529 :         GDALRasterBand *poOverview = poBand->GetOverview(iOverview);
    3674       11058 :         if (poOverview == nullptr ||
    3675       11057 :             poOverview->GetXSize() > poBand->GetXSize() ||
    3676        5528 :             poOverview->GetYSize() > poBand->GetYSize())
    3677             :         {
    3678           1 :             continue;  // missing overview, or one larger than the band itself
    3679             :         }
    3680             : 
    3681             :         // Compute downsampling factor of this overview
    3682             :         const double dfDownsamplingFactor = std::min(
    3683        5528 :             poBand->GetXSize() / static_cast<double>(poOverview->GetXSize()),
    3684       11056 :             poBand->GetYSize() / static_cast<double>(poOverview->GetYSize()));
    3685             : 
    3686             :         // Is it nearly the requested factor and better (higher) than
    3687             :         // the current best factor?
    3688             :         // Use an epsilon because of numerical instability.
    3689        5528 :         constexpr double EPSILON = 1e-1;
    3690        5636 :         if (dfDownsamplingFactor >=
    3691        5528 :                 dfDesiredDownsamplingFactor * dfOversamplingThreshold +
    3692        5420 :                     EPSILON ||
    3693             :             dfDownsamplingFactor <= dfBestDownsamplingFactor)
    3694             :         {
    3695         108 :             continue;  // too coarse for the request, or not coarser than the current best
    3696             :         }
    3697             : 
    3698             :         // Ignore AVERAGE_BIT2GRAYSCALE overviews for RasterIO purposes.
    3699        5420 :         const char *pszResampling = poOverview->GetMetadataItem("RESAMPLING");
    3700             : 
    3701        5420 :         if (pszResampling != nullptr &&
    3702          71 :             STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2"))
    3703          16 :             continue;
    3704             : 
    3705             :         // OK, this is our new best overview.
    3706        5404 :         poBestOverview = poOverview;
    3707        5404 :         nBestOverviewLevel = iOverview;
    3708        5404 :         dfBestDownsamplingFactor = dfDownsamplingFactor;
    3709             : 
    3710        5404 :         if (std::abs(dfDesiredDownsamplingFactor - dfDownsamplingFactor) <
    3711             :             EPSILON)
    3712             :         {
    3713        2838 :             break;  // within EPSILON of the requested factor: stop searching
    3714             :         }
    3715             :     }
    3716             : 
    3717             :     /* -------------------------------------------------------------------- */
    3718             :     /*      If we didn't find an overview that helps us, just return        */
    3719             :     /*      indicating failure and the full resolution image will be used.  */
    3720             :     /* -------------------------------------------------------------------- */
    3721      322828 :     if (nBestOverviewLevel < 0)
    3722      319922 :         return -1;
    3723             : 
    3724             :     /* -------------------------------------------------------------------- */
    3725             :     /*      Recompute the source window in terms of the selected            */
    3726             :     /*      overview.                                                       */
    3727             :     /* -------------------------------------------------------------------- */
    3728             :     const double dfXFactor =
    3729        2906 :         poBand->GetXSize() / static_cast<double>(poBestOverview->GetXSize());
    3730             :     const double dfYFactor =
    3731        2906 :         poBand->GetYSize() / static_cast<double>(poBestOverview->GetYSize());
    3732        2906 :     CPLDebug("GDAL", "Selecting overview %d x %d", poBestOverview->GetXSize(),
    3733             :              poBestOverview->GetYSize());
    3734             : 
    3735        8718 :     const int nOXOff = std::min(poBestOverview->GetXSize() - 1,  // clamp the offset to the last overview column
    3736        2906 :                                 static_cast<int>(nXOff / dfXFactor + 0.5));  // +0.5: round to nearest
    3737        8718 :     const int nOYOff = std::min(poBestOverview->GetYSize() - 1,
    3738        2906 :                                 static_cast<int>(nYOff / dfYFactor + 0.5));
    3739        2906 :     int nOXSize = std::max(1, static_cast<int>(nXSize / dfXFactor + 0.5));  // never shrink the window below one pixel
    3740        2906 :     int nOYSize = std::max(1, static_cast<int>(nYSize / dfYFactor + 0.5));
    3741        2906 :     if (nOXOff + nOXSize > poBestOverview->GetXSize())
    3742           0 :         nOXSize = poBestOverview->GetXSize() - nOXOff;  // trim a window that overruns the overview extent
    3743        2906 :     if (nOYOff + nOYSize > poBestOverview->GetYSize())
    3744           2 :         nOYSize = poBestOverview->GetYSize() - nOYOff;
    3745             : 
    3746        2906 :     if (psExtraArg)
    3747             :     {
    3748        2906 :         if (psExtraArg->bFloatingPointWindowValidity)
    3749             :         {
    3750          45 :             psExtraArg->dfXOff /= dfXFactor;  // rescale the caller-supplied floating-point window
    3751          45 :             psExtraArg->dfXSize /= dfXFactor;
    3752          45 :             psExtraArg->dfYOff /= dfYFactor;
    3753          45 :             psExtraArg->dfYSize /= dfYFactor;
    3754             :         }
    3755        2861 :         else if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour)
    3756             :         {
    3757          16 :             psExtraArg->bFloatingPointWindowValidity = true;  // derive an unrounded window from the integer one
    3758          16 :             psExtraArg->dfXOff = nXOff / dfXFactor;
    3759          16 :             psExtraArg->dfXSize = nXSize / dfXFactor;
    3760          16 :             psExtraArg->dfYOff = nYOff / dfYFactor;
    3761          16 :             psExtraArg->dfYSize = nYSize / dfYFactor;
    3762             :         }
    3763             :     }
    3764             : 
    3765        2906 :     nXOff = nOXOff;  // the in/out window is only overwritten on success
    3766        2906 :     nYOff = nOYOff;
    3767        2906 :     nXSize = nOXSize;
    3768        2906 :     nYSize = nOYSize;
    3769             : 
    3770        2906 :     return nBestOverviewLevel;
    3771             : }
    3772             : 
    3773             : /************************************************************************/
    3774             : /*                          OverviewRasterIO()                          */
    3775             : /*                                                                      */
    3776             : /*      Special work function to utilize available overviews to         */
    3777             : /*      more efficiently satisfy downsampled requests.  It will         */
    3778             : /*      return CE_Failure if there are no appropriate overviews         */
    3779             : /*      available, but it doesn't emit any error messages itself.       */
    3780             : /************************************************************************/
    3781             : 
    3782             : //! @cond Doxygen_Suppress
    3783           2 : CPLErr GDALRasterBand::OverviewRasterIO(
    3784             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    3785             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    3786             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg)
    3787             : 
    3788             : {
    3789             :     GDALRasterIOExtraArg sExtraArg;
    3790           2 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);  // local copy: the overview selection below may rewrite its window fields
    3791             : 
    3792           2 :     const int nOverview = GDALBandGetBestOverviewLevel2(
    3793             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, &sExtraArg);  // on success the window is now in overview pixels
    3794           2 :     if (nOverview < 0)
    3795           1 :         return CE_Failure;  // no suitable overview
    3796             : 
    3797             :     /* -------------------------------------------------------------------- */
    3798             :     /*      Recast the call in terms of the new raster layer.               */
    3799             :     /* -------------------------------------------------------------------- */
    3800           1 :     GDALRasterBand *poOverviewBand = GetOverview(nOverview);
    3801           1 :     if (poOverviewBand == nullptr)
    3802           0 :         return CE_Failure;
    3803             : 
    3804           1 :     return poOverviewBand->RasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    3805             :                                     pData, nBufXSize, nBufYSize, eBufType,
    3806           1 :                                     nPixelSpace, nLineSpace, &sExtraArg);
    3807             : }
    3808             : 
    3809             : /************************************************************************/
    3810             : /*                      TryOverviewRasterIO()                           */
    3811             : /************************************************************************/
    3812             : 
    3813      161946 : CPLErr GDALRasterBand::TryOverviewRasterIO(
    3814             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    3815             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    3816             :     GSpacing nPixelSpace, GSpacing nLineSpace, GDALRasterIOExtraArg *psExtraArg,
    3817             :     int *pbTried)
    3818             : {
    3819      161946 :     int nXOffMod = nXOff;  // working copies: the overview selection rewrites the window in place
    3820      161946 :     int nYOffMod = nYOff;
    3821      161946 :     int nXSizeMod = nXSize;
    3822      161946 :     int nYSizeMod = nYSize;
    3823             :     GDALRasterIOExtraArg sExtraArg;
    3824             : 
    3825      161946 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);  // likewise work on a local copy of the extra-arg
    3826             : 
    3827      161946 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    3828             :         this, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize, nBufYSize,
    3829             :         &sExtraArg);
    3830             : 
    3831      161946 :     if (iOvrLevel >= 0)
    3832             :     {
    3833          49 :         GDALRasterBand *poOverviewBand = GetOverview(iOvrLevel);
    3834          49 :         if (poOverviewBand)
    3835             :         {
    3836          49 :             *pbTried = TRUE;  // an overview was used; the return value is that RasterIO() result
    3837          49 :             return poOverviewBand->RasterIO(
    3838             :                 eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData,
    3839             :                 nBufXSize, nBufYSize, eBufType, nPixelSpace, nLineSpace,
    3840          49 :                 &sExtraArg);
    3841             :         }
    3842             :     }
    3843             : 
    3844      161897 :     *pbTried = FALSE;  // nothing was read or written here
    3845      161897 :     return CE_None;  // not an error: *pbTried tells the caller no overview was used
    3846             : }
    3847             : 
    3848             : /************************************************************************/
    3849             : /*                      TryOverviewRasterIO()                           */
    3850             : /************************************************************************/
    3851             : 
    3852      158041 : CPLErr GDALDataset::TryOverviewRasterIO(
    3853             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    3854             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    3855             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    3856             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg,
    3857             :     int *pbTried)
    3858             : {
    3859      158041 :     int nXOffMod = nXOff;  // working copies: the overview selection rewrites the window in place
    3860      158041 :     int nYOffMod = nYOff;
    3861      158041 :     int nXSizeMod = nXSize;
    3862      158041 :     int nYSizeMod = nYSize;
    3863             :     GDALRasterIOExtraArg sExtraArg;
    3864      158041 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    3865             : 
    3866      316082 :     int iOvrLevel = GDALBandGetBestOverviewLevel2(
    3867      158041 :         papoBands[0], nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, nBufXSize,  // level is chosen from the dataset's first band, not from panBandMap
    3868             :         nBufYSize, &sExtraArg);
    3869             : 
    3870      158080 :     if (iOvrLevel >= 0 && papoBands[0]->GetOverview(iOvrLevel) != nullptr &&
    3871          39 :         papoBands[0]->GetOverview(iOvrLevel)->GetDataset() != nullptr)  // the overview band must belong to a dataset
    3872             :     {
    3873          39 :         *pbTried = TRUE;  // an overview dataset was used; the return value is its RasterIO() result
    3874          39 :         return papoBands[0]->GetOverview(iOvrLevel)->GetDataset()->RasterIO(
    3875             :             eRWFlag, nXOffMod, nYOffMod, nXSizeMod, nYSizeMod, pData, nBufXSize,
    3876             :             nBufYSize, eBufType, nBandCount, panBandMap, nPixelSpace,  // panBandMap is forwarded unchanged
    3877          39 :             nLineSpace, nBandSpace, &sExtraArg);
    3878             :     }
    3879             :     else
    3880             :     {
    3881      158002 :         *pbTried = FALSE;  // nothing was read or written here
    3882      158002 :         return CE_None;  // not an error: *pbTried tells the caller no overview was used
    3883             :     }
    3884             : }
    3885             : 
    3886             : /************************************************************************/
    3887             : /*                  GDALDatasetGetBestOverviewLevel()                   */
    3888             : /*                                                                      */
    3889             : /* Returns the best overview level to satisfy the query or -1 if none   */
    3890             : /* Also updates nXOff, nYOff, nXSize, nYSize when returning a valid     */
    3891             : /* overview level                                                       */
    3892             : /************************************************************************/
    3893             : 
    3894           4 : static int GDALDatasetGetBestOverviewLevel(GDALDataset *poDS, int &nXOff,
    3895             :                                            int &nYOff, int &nXSize, int &nYSize,
    3896             :                                            int nBufXSize, int nBufYSize,
    3897             :                                            int nBandCount,
    3898             :                                            const int *panBandMap,
    3899             :                                            GDALRasterIOExtraArg *psExtraArg)
    3900             : {
    3901           4 :     int nOverviewCount = 0;
    3902           4 :     GDALRasterBand *poFirstBand = nullptr;
    3903             : 
    3904             :     /* -------------------------------------------------------------------- */
    3905             :     /* Check that all bands have the same number of overviews and           */
    3906             :     /* that they have all the same size and block dimensions                */
    3907             :     /* -------------------------------------------------------------------- */
    3908          12 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    3909             :     {
    3910           8 :         GDALRasterBand *poBand = poDS->GetRasterBand(panBandMap[iBand]);
    3911           8 :         if (poBand == nullptr)
    3912           0 :             return -1;
    3913           8 :         if (iBand == 0)
    3914             :         {
    3915           4 :             poFirstBand = poBand;  // first requested band is the reference for all comparisons
    3916           4 :             nOverviewCount = poBand->GetOverviewCount();
    3917             :         }
    3918           4 :         else if (nOverviewCount != poBand->GetOverviewCount())
    3919             :         {
    3920           0 :             CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    3921             :                              "mismatched overview count, use std method.");
    3922           0 :             return -1;
    3923             :         }
    3924             :         else
    3925             :         {
    3926           4 :             for (int iOverview = 0; iOverview < nOverviewCount; iOverview++)
    3927             :             {
    3928           0 :                 GDALRasterBand *poOvrBand = poBand->GetOverview(iOverview);
    3929             :                 GDALRasterBand *poOvrFirstBand =
    3930           0 :                     poFirstBand->GetOverview(iOverview);
    3931           0 :                 if (poOvrBand == nullptr || poOvrFirstBand == nullptr)
    3932           0 :                     continue;  // a level missing on either band is skipped, not treated as a mismatch
    3933             : 
    3934           0 :                 if (poOvrFirstBand->GetXSize() != poOvrBand->GetXSize() ||
    3935           0 :                     poOvrFirstBand->GetYSize() != poOvrBand->GetYSize())
    3936             :                 {
    3937           0 :                     CPLDebug("GDAL",
    3938             :                              "GDALDataset::GetBestOverviewLevel() ... "
    3939             :                              "mismatched overview sizes, use std method.");
    3940           0 :                     return -1;
    3941             :                 }
    3942           0 :                 int nBlockXSizeFirst = 0;
    3943           0 :                 int nBlockYSizeFirst = 0;
    3944           0 :                 poOvrFirstBand->GetBlockSize(&nBlockXSizeFirst,
    3945             :                                              &nBlockYSizeFirst);
    3946             : 
    3947           0 :                 int nBlockXSizeCurrent = 0;
    3948           0 :                 int nBlockYSizeCurrent = 0;
    3949           0 :                 poOvrBand->GetBlockSize(&nBlockXSizeCurrent,
    3950             :                                         &nBlockYSizeCurrent);
    3951             : 
    3952           0 :                 if (nBlockXSizeFirst != nBlockXSizeCurrent ||
    3953           0 :                     nBlockYSizeFirst != nBlockYSizeCurrent)
    3954             :                 {
    3955           0 :                     CPLDebug("GDAL", "GDALDataset::GetBestOverviewLevel() ... "
    3956             :                                      "mismatched block sizes, use std method.");
    3957           0 :                     return -1;
    3958             :                 }
    3959             :             }
    3960             :         }
    3961             :     }
    3962           4 :     if (poFirstBand == nullptr)  // only reachable when nBandCount <= 0
    3963           0 :         return -1;
    3964             : 
    3965           4 :     return GDALBandGetBestOverviewLevel2(poFirstBand, nXOff, nYOff, nXSize,  // bands agree: select the level using the first band
    3966             :                                          nYSize, nBufXSize, nBufYSize,
    3967           4 :                                          psExtraArg);
    3968             : }
    3969             : 
    3970             : /************************************************************************/
    3971             : /*                         BlockBasedRasterIO()                         */
    3972             : /*                                                                      */
    3973             : /*      This convenience function implements a dataset level            */
    3974             : /*      RasterIO() interface based on calling down to fetch blocks,     */
    3975             : /*      much like the GDALRasterBand::IRasterIO(), but it handles       */
    3976             : /*      all bands at once, so that a format driver that handles a       */
    3977             : /*      request for different bands of the same block efficiently       */
    3978             : /*      (i.e. without re-reading interleaved data) benefits from it.    */
    3979             : /*                                                                      */
    3980             : /*      This method is intended to be called by an overridden           */
    3981             : /*      IRasterIO() method in the driver specific GDALDataset           */
    3982             : /*      derived class.                                                  */
    3983             : /*                                                                      */
    3984             : /*      Default internal implementation of RasterIO() ... utilizes      */
    3985             : /*      the Block access methods to satisfy the request.  This would    */
    3986             : /*      normally only be overridden by formats with overviews.          */
    3987             : /*                                                                      */
    3988             : /*      To keep things relatively simple, this method does not          */
    3989             : /*      currently take advantage of some special cases addressed in     */
    3990             : /*      GDALRasterBand::IRasterIO(), so it is likely best to only       */
    3991             : /*      call it when you know it will help.  That is in cases where     */
    3992             : /*      data is at 1:1 to the buffer, and you know the driver is        */
    3993             : /*      implementing interleaved IO efficiently on a block by block     */
    3994             : /*      basis. Overviews will be used when possible.                    */
    3995             : /************************************************************************/
    3996             : 
    3997       63624 : CPLErr GDALDataset::BlockBasedRasterIO(
    3998             :     GDALRWFlag eRWFlag, int nXOff, int nYOff, int nXSize, int nYSize,
    3999             :     void *pData, int nBufXSize, int nBufYSize, GDALDataType eBufType,
    4000             :     int nBandCount, const int *panBandMap, GSpacing nPixelSpace,
    4001             :     GSpacing nLineSpace, GSpacing nBandSpace, GDALRasterIOExtraArg *psExtraArg)
    4002             : // Returns CE_None, or an error if a band I/O call, a block lock or the progress callback fails.
    4003             : {
    4004       63624 :     CPLAssert(nullptr != pData);
    4005             : 
    4006       63624 :     GByte **papabySrcBlock = nullptr;
    4007       63624 :     GDALRasterBlock *poBlock = nullptr;
    4008       63624 :     GDALRasterBlock **papoBlocks = nullptr;
    4009       63624 :     int nLBlockX = -1;
    4010       63624 :     int nLBlockY = -1;
    4011             :     int iBufYOff;
    4012             :     int iBufXOff;
    4013       63624 :     int nBlockXSize = 1;
    4014       63624 :     int nBlockYSize = 1;
    4015       63624 :     CPLErr eErr = CE_None;
    4016       63624 :     GDALDataType eDataType = GDT_Byte;
    4017             :     // True when no floating-point window is given, or when it equals the integer window.
    4018       63624 :     const bool bUseIntegerRequestCoords =
    4019       64057 :         (!psExtraArg->bFloatingPointWindowValidity ||
    4020         433 :          (nXOff == psExtraArg->dfXOff && nYOff == psExtraArg->dfYOff &&
    4021         431 :           nXSize == psExtraArg->dfXSize && nYSize == psExtraArg->dfYSize));
    4022             : 
    4023             :     /* -------------------------------------------------------------------- */
    4024             :     /*      Ensure that all bands share a common block size and data type.  */
    4025             :     /* -------------------------------------------------------------------- */
    4026      301197 :     for (int iBand = 0; iBand < nBandCount; iBand++)
    4027             :     {
    4028      237574 :         GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4029             :         // The first requested band provides the reference block size and data type.
    4030      237573 :         if (iBand == 0)
    4031             :         {
    4032       63622 :             poBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4033       63622 :             eDataType = poBand->GetRasterDataType();
    4034             :         }
    4035             :         else
    4036             :         {
    4037      173951 :             int nThisBlockXSize = 0;
    4038      173951 :             int nThisBlockYSize = 0;
    4039      173951 :             poBand->GetBlockSize(&nThisBlockXSize, &nThisBlockYSize);
    4040      173951 :             if (nThisBlockXSize != nBlockXSize ||
    4041      173951 :                 nThisBlockYSize != nBlockYSize)
    4042             :             {
    4043           1 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4044             :                                  "mismatched block sizes, use std method.");
    4045           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4046             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4047             :                                          nBandCount, panBandMap, nPixelSpace,
    4048           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4049             :             }
    4050             :             // The generic loop below sizes every band from eDataType: mixed types are only OK on the 1:1 path.
    4051      173950 :             if (eDataType != poBand->GetRasterDataType() &&
    4052           0 :                 (nXSize != nBufXSize || nYSize != nBufYSize || !bUseIntegerRequestCoords))
    4053             :             {
    4054           1 :                 CPLDebug("GDAL", "GDALDataset::BlockBasedRasterIO() ... "
    4055             :                                  "mismatched band data types, use std method.");
    4056           0 :                 return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize,
    4057             :                                          pData, nBufXSize, nBufYSize, eBufType,
    4058             :                                          nBandCount, panBandMap, nPixelSpace,
    4059           0 :                                          nLineSpace, nBandSpace, psExtraArg);
    4060             :             }
    4061             :         }
    4062             :     }
    4063             : 
    4064             :     /* ==================================================================== */
    4065             :     /*      In this special case at full resolution we step through in      */
    4066             :     /*      blocks, turning the request over to the per-band                */
    4067             :     /*      IRasterIO(), but ensuring that all bands of one block are       */
    4068             :     /*      called before proceeding to the next.                           */
    4069             :     /* ==================================================================== */
    4070             : 
    4071       63623 :     if (nXSize == nBufXSize && nYSize == nBufYSize && bUseIntegerRequestCoords)
    4072             :     {
    4073             :         GDALRasterIOExtraArg sDummyExtraArg;
    4074       63619 :         INIT_RASTERIO_EXTRA_ARG(sDummyExtraArg);
    4075             : 
    4076       63619 :         int nChunkYSize = 0;
    4077       63619 :         int nChunkXSize = 0;
    4078             :         // Walk the window in chunks that never cross a block boundary.
    4079      220077 :         for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff += nChunkYSize)
    4080             :         {
    4081      157487 :             const int nChunkYOff = iBufYOff + nYOff;
    4082      157487 :             nChunkYSize = nBlockYSize - (nChunkYOff % nBlockYSize);
    4083      157487 :             if (nChunkYOff + nChunkYSize > nYOff + nYSize)
    4084       58874 :                 nChunkYSize = (nYOff + nYSize) - nChunkYOff;
    4085             : 
    4086      838808 :             for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff += nChunkXSize)
    4087             :             {
    4088      682347 :                 const int nChunkXOff = iBufXOff + nXOff;
    4089      682347 :                 nChunkXSize = nBlockXSize - (nChunkXOff % nBlockXSize);
    4090      682347 :                 if (nChunkXOff + nChunkXSize > nXOff + nXSize)
    4091       74878 :                     nChunkXSize = (nXOff + nXSize) - nChunkXOff;
    4092             :                 // Start of this chunk in the caller's buffer, for the first band of the band map.
    4093      682347 :                 GByte *pabyChunkData =
    4094      682347 :                     static_cast<GByte *>(pData) + iBufXOff * nPixelSpace +
    4095      682347 :                     static_cast<GPtrDiff_t>(iBufYOff) * nLineSpace;
    4096             :                 // All bands of a chunk are processed before moving on to the next chunk.
    4097     3315810 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4098             :                 {
    4099     2634490 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4100             : 
    4101     5268930 :                     eErr = poBand->IRasterIO(
    4102             :                         eRWFlag, nChunkXOff, nChunkYOff, nChunkXSize,
    4103             :                         nChunkYSize,
    4104     2634440 :                         pabyChunkData +
    4105     2634440 :                             static_cast<GPtrDiff_t>(iBand) * nBandSpace,
    4106             :                         nChunkXSize, nChunkYSize, eBufType, nPixelSpace,
    4107     2634440 :                         nLineSpace, &sDummyExtraArg);
    4108     2634490 :                     if (eErr != CE_None)
    4109        1025 :                         return eErr;
    4110             :                 }
    4111             :             }
    4112             :             // Report progress once per row of chunks; a false return from the callback aborts.
    4113      176708 :             if (psExtraArg->pfnProgress != nullptr &&
    4114       20247 :                 !psExtraArg->pfnProgress(
    4115      176708 :                     1.0 * std::min(nBufYSize, iBufYOff + nChunkYSize) /
    4116             :                         nBufYSize,
    4117             :                     "", psExtraArg->pProgressData))
    4118             :             {
    4119          11 :                 return CE_Failure;
    4120             :             }
    4121             :         }
    4122             : 
    4123       62590 :         return CE_None;
    4124             :     }
    4125             : 
    4126             :     /* Below code is not compatible with that case. It would need a complete */
    4127             :     /* separate code like done in GDALRasterBand::IRasterIO. */
    4128           4 :     if (eRWFlag == GF_Write && (nBufXSize < nXSize || nBufYSize < nYSize))
    4129             :     {
    4130           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4131             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4132             :                                  panBandMap, nPixelSpace, nLineSpace,
    4133           0 :                                  nBandSpace, psExtraArg);
    4134             :     }
    4135             : 
    4136             :     /* We could have a smarter implementation, but that will do for now */
    4137           4 :     if (psExtraArg->eResampleAlg != GRIORA_NearestNeighbour &&
    4138           0 :         (nBufXSize != nXSize || nBufYSize != nYSize))
    4139             :     {
    4140           0 :         return BandBasedRasterIO(eRWFlag, nXOff, nYOff, nXSize, nYSize, pData,
    4141             :                                  nBufXSize, nBufYSize, eBufType, nBandCount,
    4142             :                                  panBandMap, nPixelSpace, nLineSpace,
    4143           0 :                                  nBandSpace, psExtraArg);
    4144             :     }
    4145             : 
    4146             :     /* ==================================================================== */
    4147             :     /*      Loop reading required source blocks to satisfy output           */
    4148             :     /*      request.  This is the most general implementation.              */
    4149             :     /* ==================================================================== */
    4150             : 
    4151           4 :     const int nBandDataSize = GDALGetDataTypeSizeBytes(eDataType);
    4152             :     // Per-band arrays: data pointer and locked block for the current block position.
    4153             :     papabySrcBlock =
    4154           4 :         static_cast<GByte **>(CPLCalloc(sizeof(GByte *), nBandCount));
    4155             :     papoBlocks =
    4156           4 :         static_cast<GDALRasterBlock **>(CPLCalloc(sizeof(void *), nBandCount));
    4157             : 
    4158             :     /* -------------------------------------------------------------------- */
    4159             :     /*      Select an overview level if appropriate.                        */
    4160             :     /* -------------------------------------------------------------------- */
    4161             : 
    4162             :     GDALRasterIOExtraArg sExtraArg;
    4163           4 :     GDALCopyRasterIOExtraArg(&sExtraArg, psExtraArg);
    4164           4 :     const int nOverviewLevel = GDALDatasetGetBestOverviewLevel(
    4165             :         this, nXOff, nYOff, nXSize, nYSize, nBufXSize, nBufYSize, nBandCount,
    4166             :         panBandMap, &sExtraArg);
    4167           4 :     if (nOverviewLevel >= 0)
    4168             :     {
    4169           2 :         GetRasterBand(panBandMap[0])
    4170           2 :             ->GetOverview(nOverviewLevel)
    4171           2 :             ->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4172             :     }
    4173             :     // Source window: the integer request, or the floating-point one held in sExtraArg.
    4174           4 :     double dfXOff = nXOff;
    4175           4 :     double dfYOff = nYOff;
    4176           4 :     double dfXSize = nXSize;
    4177           4 :     double dfYSize = nYSize;
    4178           4 :     if (sExtraArg.bFloatingPointWindowValidity)
    4179             :     {
    4180           2 :         dfXOff = sExtraArg.dfXOff;
    4181           2 :         dfYOff = sExtraArg.dfYOff;
    4182           2 :         dfXSize = sExtraArg.dfXSize;
    4183           2 :         dfYSize = sExtraArg.dfYSize;
    4184             :     }
    4185             : 
    4186             :     /* -------------------------------------------------------------------- */
    4187             :     /*      Compute stepping increment.                                     */
    4188             :     /* -------------------------------------------------------------------- */
    4189           4 :     const double dfSrcXInc = dfXSize / static_cast<double>(nBufXSize);
    4190           4 :     const double dfSrcYInc = dfYSize / static_cast<double>(nBufYSize);
    4191             : 
    4192           4 :     constexpr double EPS = 1e-10;
    4193             :     /* -------------------------------------------------------------------- */
    4194             :     /*      Loop over buffer computing source locations.                    */
    4195             :     /* -------------------------------------------------------------------- */
    4196          36 :     for (iBufYOff = 0; iBufYOff < nBufYSize; iBufYOff++)
    4197             :     {
    4198             :         GPtrDiff_t iSrcOffset;
    4199             : 
    4200             :         // Add small epsilon to avoid some numeric precision issues.
    4201          32 :         const double dfSrcY = (iBufYOff + 0.5) * dfSrcYInc + dfYOff + EPS;
    4202          32 :         const int iSrcY = static_cast<int>(std::min(
    4203          32 :             std::max(0.0, dfSrcY), static_cast<double>(nRasterYSize - 1)));
    4204             :         // Byte offset in pData of the first pixel of this buffer line (first band).
    4205          32 :         GPtrDiff_t iBufOffset = static_cast<GPtrDiff_t>(iBufYOff) *
    4206             :                                 static_cast<GPtrDiff_t>(nLineSpace);
    4207             : 
    4208         302 :         for (iBufXOff = 0; iBufXOff < nBufXSize; iBufXOff++)
    4209             :         {
    4210         270 :             const double dfSrcX = (iBufXOff + 0.5) * dfSrcXInc + dfXOff + EPS;
    4211         270 :             const int iSrcX = static_cast<int>(std::min(
    4212         270 :                 std::max(0.0, dfSrcX), static_cast<double>(nRasterXSize - 1)));
    4213             : 
    4214             :             // FIXME: this code likely doesn't work if the dirty block gets
    4215             :             // flushed to disk before being completely written. In the meantime,
    4216             :             // bJustInitialize should probably be set to FALSE even if it is not
    4217             :             // ideal performance wise, and for lossy compression
    4218             : 
    4219             :             /* --------------------------------------------------------------------
    4220             :              */
    4221             :             /*      Ensure we have the appropriate block loaded. */
    4222             :             /* --------------------------------------------------------------------
    4223             :              */
    4224         270 :             if (iSrcX < nLBlockX * nBlockXSize ||
    4225         270 :                 iSrcX - nBlockXSize >= nLBlockX * nBlockXSize ||
    4226         266 :                 iSrcY < nLBlockY * nBlockYSize ||
    4227         266 :                 iSrcY - nBlockYSize >= nLBlockY * nBlockYSize)
    4228             :             {
    4229           4 :                 nLBlockX = iSrcX / nBlockXSize;
    4230           4 :                 nLBlockY = iSrcY / nBlockYSize;
    4231             :                 // bJustInitialize: writing, and the block lies entirely inside the request window.
    4232           4 :                 const bool bJustInitialize =
    4233           0 :                     eRWFlag == GF_Write && nYOff <= nLBlockY * nBlockYSize &&
    4234           0 :                     nYOff + nYSize - nBlockYSize >= nLBlockY * nBlockYSize &&
    4235           4 :                     nXOff <= nLBlockX * nBlockXSize &&
    4236           0 :                     nXOff + nXSize - nBlockXSize >= nLBlockX * nBlockXSize;
    4237             :                 /*bool bMemZeroBuffer = FALSE;
    4238             :                 if( eRWFlag == GF_Write && !bJustInitialize &&
    4239             :                     nXOff <= nLBlockX * nBlockXSize &&
    4240             :                     nYOff <= nLBlockY * nBlockYSize &&
    4241             :                     (nXOff + nXSize >= (nLBlockX+1) * nBlockXSize ||
    4242             :                      (nXOff + nXSize == GetRasterXSize() &&
    4243             :                      (nLBlockX+1) * nBlockXSize > GetRasterXSize())) &&
    4244             :                     (nYOff + nYSize >= (nLBlockY+1) * nBlockYSize ||
    4245             :                      (nYOff + nYSize == GetRasterYSize() &&
    4246             :                      (nLBlockY+1) * nBlockYSize > GetRasterYSize())) )
    4247             :                 {
    4248             :                     bJustInitialize = TRUE;
    4249             :                     bMemZeroBuffer = TRUE;
    4250             :                 }*/
    4251          12 :                 for (int iBand = 0; iBand < nBandCount; iBand++)
    4252             :                 {
    4253           8 :                     GDALRasterBand *poBand = GetRasterBand(panBandMap[iBand]);
    4254           8 :                     if (nOverviewLevel >= 0)
    4255           2 :                         poBand = poBand->GetOverview(nOverviewLevel);
    4256          16 :                     poBlock = poBand->GetLockedBlockRef(nLBlockX, nLBlockY,
    4257           8 :                                                         bJustInitialize);
    4258           8 :                     if (poBlock == nullptr)
    4259             :                     {
    4260           0 :                         eErr = CE_Failure;
    4261           0 :                         goto CleanupAndReturn;
    4262             :                     }
    4263             : 
    4264           8 :                     if (eRWFlag == GF_Write)
    4265           0 :                         poBlock->MarkDirty();
    4266             :                     // Release the lock held on the previously used block of this band.
    4267           8 :                     if (papoBlocks[iBand] != nullptr)
    4268           0 :                         papoBlocks[iBand]->DropLock();
    4269             : 
    4270           8 :                     papoBlocks[iBand] = poBlock;
    4271             : 
    4272           8 :                     papabySrcBlock[iBand] =
    4273           8 :                         static_cast<GByte *>(poBlock->GetDataRef());
    4274             :                     /*if( bMemZeroBuffer )
    4275             :                     {
    4276             :                         memset(papabySrcBlock[iBand], 0,
    4277             :                             static_cast<GPtrDiff_t>(nBandDataSize) * nBlockXSize
    4278             :                     * nBlockYSize);
    4279             :                     }*/
    4280             :                 }
    4281             :             }
    4282             : 
    4283             :             /* --------------------------------------------------------------------
    4284             :              */
    4285             :             /*      Copy over this pixel of data. */
    4286             :             /* --------------------------------------------------------------------
    4287             :              */
    4288         270 :             iSrcOffset = (static_cast<GPtrDiff_t>(iSrcX) -
    4289         270 :                           static_cast<GPtrDiff_t>(nLBlockX) * nBlockXSize +
    4290         270 :                           (static_cast<GPtrDiff_t>(iSrcY) -
    4291         270 :                            static_cast<GPtrDiff_t>(nLBlockY) * nBlockYSize) *
    4292         270 :                               nBlockXSize) *
    4293         270 :                          nBandDataSize;
    4294             : 
    4295         980 :             for (int iBand = 0; iBand < nBandCount; iBand++)
    4296             :             {
    4297         710 :                 GByte *pabySrcBlock = papabySrcBlock[iBand];
    4298         710 :                 GPtrDiff_t iBandBufOffset =
    4299         710 :                     iBufOffset + static_cast<GPtrDiff_t>(iBand) *
    4300             :                                      static_cast<GPtrDiff_t>(nBandSpace);
    4301             : 
    4302         710 :                 if (eDataType == eBufType)
    4303             :                 {
    4304         710 :                     if (eRWFlag == GF_Read)
    4305         710 :                         memcpy(static_cast<GByte *>(pData) + iBandBufOffset,
    4306         710 :                                pabySrcBlock + iSrcOffset, nBandDataSize);
    4307             :                     else
    4308           0 :                         memcpy(pabySrcBlock + iSrcOffset,
    4309             :                                static_cast<const GByte *>(pData) +
    4310           0 :                                    iBandBufOffset,
    4311             :                                nBandDataSize);
    4312             :                 }
    4313             :                 else
    4314             :                 {
    4315             :                     /* type to type conversion ... ouch, this is expensive way
    4316             :                        of handling single words */
    4317             : 
    4318           0 :                     if (eRWFlag == GF_Read)
    4319           0 :                         GDALCopyWords64(pabySrcBlock + iSrcOffset, eDataType, 0,
    4320             :                                         static_cast<GByte *>(pData) +
    4321           0 :                                             iBandBufOffset,
    4322             :                                         eBufType, 0, 1);
    4323             :                     else
    4324           0 :                         GDALCopyWords64(static_cast<const GByte *>(pData) +
    4325           0 :                                             iBandBufOffset,
    4326           0 :                                         eBufType, 0, pabySrcBlock + iSrcOffset,
    4327             :                                         eDataType, 0, 1);
    4328             :                 }
    4329             :             }
    4330             :             // Advance in GPtrDiff_t: a cast to int would truncate a 64-bit pixel spacing.
    4331         270 :             iBufOffset += static_cast<GPtrDiff_t>(nPixelSpace);
    4332             :         }
    4333             :     }
    4334             : 
    4335             :     /* -------------------------------------------------------------------- */
    4336             :     /*      CleanupAndReturn.                                               */
    4337             :     /* -------------------------------------------------------------------- */
    4338           4 : CleanupAndReturn:
    4339           4 :     CPLFree(papabySrcBlock);
    4340           4 :     if (papoBlocks != nullptr)
    4341             :     {
    4342          12 :         for (int iBand = 0; iBand < nBandCount; iBand++)
    4343             :         {
    4344           8 :             if (papoBlocks[iBand] != nullptr)
    4345           8 :                 papoBlocks[iBand]->DropLock();
    4346             :         }
    4347           4 :         CPLFree(papoBlocks);
    4348             :     }
    4349             : 
    4350           4 :     return eErr;
    4351             : }
    4352             : 
    4353             : //! @endcond
    4354             : 
    4355             : /************************************************************************/
    4356             : /*                  GDALCopyWholeRasterGetSwathSize()                   */
    4357             : /************************************************************************/
    4358             : 
    4359        2895 : static void GDALCopyWholeRasterGetSwathSize(GDALRasterBand *poSrcPrototypeBand,
    4360             :                                             GDALRasterBand *poDstPrototypeBand,
    4361             :                                             int nBandCount,
    4362             :                                             int bDstIsCompressed,
    4363             :                                             int bInterleave, int *pnSwathCols,
    4364             :                                             int *pnSwathLines)
    4365             : {
    4366        2895 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    4367        2895 :     int nSrcBlockXSize = 0;
    4368        2895 :     int nSrcBlockYSize = 0;
    4369        2895 :     int nBlockXSize = 0;
    4370        2895 :     int nBlockYSize = 0;
    4371             : 
    4372        2895 :     int nXSize = poSrcPrototypeBand->GetXSize();
    4373        2895 :     int nYSize = poSrcPrototypeBand->GetYSize();
    4374             : 
    4375        2895 :     poSrcPrototypeBand->GetBlockSize(&nSrcBlockXSize, &nSrcBlockYSize);
    4376        2895 :     poDstPrototypeBand->GetBlockSize(&nBlockXSize, &nBlockYSize);
    4377             : 
    4378        2895 :     const int nMaxBlockXSize = std::max(nBlockXSize, nSrcBlockXSize);
    4379        2895 :     const int nMaxBlockYSize = std::max(nBlockYSize, nSrcBlockYSize);
    4380             : 
    4381        2895 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    4382        2895 :     if (bInterleave)
    4383        1340 :         nPixelSize *= nBandCount;
    4384             : 
    4385             :     // aim for one row of blocks.  Do not settle for less.
    4386        2895 :     int nSwathCols = nXSize;
    4387        2895 :     int nSwathLines = nMaxBlockYSize;
    4388             : 
    4389             :     const char *pszSrcCompression =
    4390        2895 :         poSrcPrototypeBand->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    4391        2895 :     if (pszSrcCompression == nullptr)
    4392             :     {
    4393        2869 :         auto poSrcDS = poSrcPrototypeBand->GetDataset();
    4394        2869 :         if (poSrcDS)
    4395             :             pszSrcCompression =
    4396        2863 :                 poSrcDS->GetMetadataItem("COMPRESSION", "IMAGE_STRUCTURE");
    4397             :     }
    4398             : 
    4399             :     /* -------------------------------------------------------------------- */
    4400             :     /*      What will our swath size be?                                    */
    4401             :     /* -------------------------------------------------------------------- */
    4402             :     // When writing interleaved data in a compressed format, we want to be sure
    4403             :     // that each block will only be written once, so the swath size must not be
    4404             :     // greater than the block cache.
    4405        2895 :     const char *pszSwathSize = CPLGetConfigOption("GDAL_SWATH_SIZE", nullptr);
    4406             :     int nTargetSwathSize;
    4407        2895 :     if (pszSwathSize != nullptr)
    4408           0 :         nTargetSwathSize = static_cast<int>(
    4409           0 :             std::min(GIntBig(INT_MAX), CPLAtoGIntBig(pszSwathSize)));
    4410             :     else
    4411             :     {
    4412             :         // As a default, take 1/4 of the cache size.
    4413        2895 :         nTargetSwathSize = static_cast<int>(
    4414        2895 :             std::min(GIntBig(INT_MAX), GDALGetCacheMax64() / 4));
    4415             : 
    4416             :         // but if the minimum ideal swath buf size is less, then go for it to
    4417             :         // avoid unnecessarily abusing RAM usage.
    4418             :         // but try to use 10 MB at least.
    4419        2895 :         GIntBig nIdealSwathBufSize =
    4420        2895 :             static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize;
    4421        2895 :         int nMinTargetSwathSize = 10 * 1000 * 1000;
    4422             : 
    4423        2895 :         if ((poSrcPrototypeBand->GetSuggestedBlockAccessPattern() &
    4424        2895 :              GSBAP_LARGEST_CHUNK_POSSIBLE) != 0)
    4425             :         {
    4426           2 :             nMinTargetSwathSize = nTargetSwathSize;
    4427             :         }
    4428             : 
    4429        2895 :         if (nIdealSwathBufSize < nTargetSwathSize &&
    4430        2885 :             nIdealSwathBufSize < nMinTargetSwathSize)
    4431             :         {
    4432        2882 :             nIdealSwathBufSize = nMinTargetSwathSize;
    4433             :         }
    4434             : 
    4435        2895 :         if (pszSrcCompression != nullptr &&
    4436         156 :             EQUAL(pszSrcCompression, "JPEG2000") &&
    4437           0 :             (!bDstIsCompressed || ((nSrcBlockXSize % nBlockXSize) == 0 &&
    4438           0 :                                    (nSrcBlockYSize % nBlockYSize) == 0)))
    4439             :         {
    4440           2 :             nIdealSwathBufSize =
    4441           4 :                 std::max(nIdealSwathBufSize, static_cast<GIntBig>(nSwathCols) *
    4442           2 :                                                  nSrcBlockYSize * nPixelSize);
    4443             :         }
    4444        2895 :         if (nTargetSwathSize > nIdealSwathBufSize)
    4445        2881 :             nTargetSwathSize = static_cast<int>(
    4446        2881 :                 std::min(GIntBig(INT_MAX), nIdealSwathBufSize));
    4447             :     }
    4448             : 
    4449        2895 :     if (nTargetSwathSize < 1000000)
    4450           8 :         nTargetSwathSize = 1000000;
    4451             : 
    4452             :     /* But let's check that  */
    4453        3103 :     if (bDstIsCompressed && bInterleave &&
    4454         208 :         nTargetSwathSize > GDALGetCacheMax64())
    4455             :     {
    4456           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    4457             :                  "When translating into a compressed interleave format, "
    4458             :                  "the block cache size (" CPL_FRMT_GIB ") "
    4459             :                  "should be at least the size of the swath (%d) "
    4460             :                  "(GDAL_SWATH_SIZE config. option)",
    4461             :                  GDALGetCacheMax64(), nTargetSwathSize);
    4462             :     }
    4463             : 
    4464             : #define IS_DIVIDER_OF(x, y) ((y) % (x) == 0)
    4465             : #define ROUND_TO(x, y) (((x) / (y)) * (y))
    4466             : 
    4467             :     // If both input and output datasets are tiled and the tile dimensions
    4468             :     // are "compatible", try to stick to a swath dimension that is a multiple
    4469             :     // of input and output block dimensions.
    4470        2895 :     if (nBlockXSize != nXSize && nSrcBlockXSize != nXSize &&
    4471          34 :         IS_DIVIDER_OF(nBlockXSize, nMaxBlockXSize) &&
    4472          34 :         IS_DIVIDER_OF(nSrcBlockXSize, nMaxBlockXSize) &&
    4473          34 :         IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    4474          34 :         IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    4475             :     {
    4476          34 :         if (static_cast<GIntBig>(nMaxBlockXSize) * nMaxBlockYSize *
    4477          34 :                 nPixelSize <=
    4478          34 :             static_cast<GIntBig>(nTargetSwathSize))
    4479             :         {
    4480          34 :             nSwathCols = nTargetSwathSize / (nMaxBlockYSize * nPixelSize);
    4481          34 :             nSwathCols = ROUND_TO(nSwathCols, nMaxBlockXSize);
    4482          34 :             if (nSwathCols == 0)
    4483           0 :                 nSwathCols = nMaxBlockXSize;
    4484          34 :             if (nSwathCols > nXSize)
    4485          32 :                 nSwathCols = nXSize;
    4486          34 :             nSwathLines = nMaxBlockYSize;
    4487             : 
    4488          34 :             if (static_cast<GIntBig>(nSwathCols) * nSwathLines * nPixelSize >
    4489          34 :                 static_cast<GIntBig>(nTargetSwathSize))
    4490             :             {
    4491           0 :                 nSwathCols = nXSize;
    4492           0 :                 nSwathLines = nBlockYSize;
    4493             :             }
    4494             :         }
    4495             :     }
    4496             : 
    4497        2895 :     const GIntBig nMemoryPerCol = static_cast<GIntBig>(nSwathCols) * nPixelSize;
    4498        2895 :     const GIntBig nSwathBufSize = nMemoryPerCol * nSwathLines;
    4499        2895 :     if (nSwathBufSize > static_cast<GIntBig>(nTargetSwathSize))
    4500             :     {
    4501           1 :         nSwathLines = static_cast<int>(nTargetSwathSize / nMemoryPerCol);
    4502           1 :         if (nSwathLines == 0)
    4503           1 :             nSwathLines = 1;
    4504             : 
    4505           1 :         CPLDebug(
    4506             :             "GDAL",
    4507             :             "GDALCopyWholeRasterGetSwathSize(): adjusting to %d line swath "
    4508             :             "since requirement (" CPL_FRMT_GIB " bytes) exceed target swath "
    4509             :             "size (%d bytes) (GDAL_SWATH_SIZE config. option)",
    4510           1 :             nSwathLines, nBlockYSize * nMemoryPerCol, nTargetSwathSize);
    4511             :     }
    4512             :     // If we are processing single scans, try to handle several at once.
    4513             :     // If we are handling swaths already, only grow the swath if a row
    4514             :     // of blocks is substantially less than our target buffer size.
    4515        2894 :     else if (nSwathLines == 1 ||
    4516        2393 :              nMemoryPerCol * nSwathLines <
    4517        2393 :                  static_cast<GIntBig>(nTargetSwathSize) / 10)
    4518             :     {
    4519        2867 :         nSwathLines = std::min(
    4520             :             nYSize,
    4521        2867 :             std::max(1, static_cast<int>(nTargetSwathSize / nMemoryPerCol)));
    4522             : 
    4523             :         /* If possible try to align to source and target block height */
    4524        2867 :         if ((nSwathLines % nMaxBlockYSize) != 0 &&
    4525         979 :             nSwathLines > nMaxBlockYSize &&
    4526         979 :             IS_DIVIDER_OF(nBlockYSize, nMaxBlockYSize) &&
    4527         951 :             IS_DIVIDER_OF(nSrcBlockYSize, nMaxBlockYSize))
    4528         169 :             nSwathLines = ROUND_TO(nSwathLines, nMaxBlockYSize);
    4529             :     }
    4530             : 
    4531        2895 :     if (pszSrcCompression != nullptr && EQUAL(pszSrcCompression, "JPEG2000") &&
    4532           0 :         (!bDstIsCompressed || (IS_DIVIDER_OF(nBlockXSize, nSrcBlockXSize) &&
    4533           0 :                                IS_DIVIDER_OF(nBlockYSize, nSrcBlockYSize))))
    4534             :     {
    4535             :         // Typical use case: converting from Pleiades that is 2048x2048 tiled.
    4536           2 :         if (nSwathLines < nSrcBlockYSize)
    4537             :         {
    4538           0 :             nSwathLines = nSrcBlockYSize;
    4539             : 
    4540             :             // Number of pixels that can be read/write simultaneously.
    4541           0 :             nSwathCols = nTargetSwathSize / (nSrcBlockXSize * nPixelSize);
    4542           0 :             nSwathCols = ROUND_TO(nSwathCols, nSrcBlockXSize);
    4543           0 :             if (nSwathCols == 0)
    4544           0 :                 nSwathCols = nSrcBlockXSize;
    4545           0 :             if (nSwathCols > nXSize)
    4546           0 :                 nSwathCols = nXSize;
    4547             : 
    4548           0 :             CPLDebug(
    4549             :                 "GDAL",
    4550             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    4551             :                 "too high block, "
    4552             :                 "use partial width at one time");
    4553             :         }
    4554           2 :         else if ((nSwathLines % nSrcBlockYSize) != 0)
    4555             :         {
    4556             :             /* Round on a multiple of nSrcBlockYSize */
    4557           0 :             nSwathLines = ROUND_TO(nSwathLines, nSrcBlockYSize);
    4558           0 :             CPLDebug(
    4559             :                 "GDAL",
    4560             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    4561             :                 "round nSwathLines to block height : %d",
    4562             :                 nSwathLines);
    4563             :         }
    4564             :     }
    4565        2893 :     else if (bDstIsCompressed)
    4566             :     {
    4567         366 :         if (nSwathLines < nBlockYSize)
    4568             :         {
    4569         142 :             nSwathLines = nBlockYSize;
    4570             : 
    4571             :             // Number of pixels that can be read/write simultaneously.
    4572         142 :             nSwathCols = nTargetSwathSize / (nSwathLines * nPixelSize);
    4573         142 :             nSwathCols = ROUND_TO(nSwathCols, nBlockXSize);
    4574         142 :             if (nSwathCols == 0)
    4575           0 :                 nSwathCols = nBlockXSize;
    4576         142 :             if (nSwathCols > nXSize)
    4577         142 :                 nSwathCols = nXSize;
    4578             : 
    4579         142 :             CPLDebug(
    4580             :                 "GDAL",
    4581             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression and "
    4582             :                 "too high block, "
    4583             :                 "use partial width at one time");
    4584             :         }
    4585         224 :         else if ((nSwathLines % nBlockYSize) != 0)
    4586             :         {
    4587             :             // Round on a multiple of nBlockYSize.
    4588           9 :             nSwathLines = ROUND_TO(nSwathLines, nBlockYSize);
    4589           9 :             CPLDebug(
    4590             :                 "GDAL",
    4591             :                 "GDALCopyWholeRasterGetSwathSize(): because of compression, "
    4592             :                 "round nSwathLines to block height : %d",
    4593             :                 nSwathLines);
    4594             :         }
    4595             :     }
    4596             : 
    4597        2895 :     *pnSwathCols = nSwathCols;
    4598        2895 :     *pnSwathLines = nSwathLines;
    4599        2895 : }
    4600             : 
    4601             : /************************************************************************/
    4602             : /*                     GDALDatasetCopyWholeRaster()                     */
    4603             : /************************************************************************/
    4604             : 
    4605             : /**
    4606             :  * \brief Copy all dataset raster data.
    4607             :  *
    4608             :  * This function copies the complete raster contents of one dataset to
    4609             :  * another similarly configured dataset.  The source and destination
    4610             :  * dataset must have the same number of bands, and the same width
    4611             :  * and height.  The bands do not have to have the same data type.
    4612             :  *
    4613             :  * This function is primarily intended to support implementation of
    4614             :  * driver specific CreateCopy() functions.  It implements efficient copying,
    4615             :  * in particular "chunking" the copy in substantial blocks and, if appropriate,
    4616             :  * performing the transfer in a pixel interleaved fashion.
    4617             :  *
    4618             :  * Currently the only papszOptions values supported are:
    4619             :  * <ul>
    4620             :  * <li>"INTERLEAVE=PIXEL/BAND" to force pixel (resp. band) interleaved read and
    4621             :  * write access pattern (this does not modify the layout of the destination
    4622             :  * data)</li> <li>"COMPRESSED=YES" to force alignment on target dataset block
    4623             :  * sizes to achieve best compression.</li> <li>"SKIP_HOLES=YES" to skip chunks
    4624             :  * for which GDALGetDataCoverageStatus() returns GDAL_DATA_COVERAGE_STATUS_EMPTY
    4625             :  * (GDAL &gt;= 2.2)</li>
    4626             :  * </ul>
    4627             :  * More options may be supported in the future.
    4628             :  *
    4629             :  * @param hSrcDS the source dataset
    4630             :  * @param hDstDS the destination dataset
    4631             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    4632             :  * @param pfnProgress progress reporting function.
    4633             :  * @param pProgressData callback data for progress function.
    4634             :  *
    4635             :  * @return CE_None on success, or CE_Failure on failure.
    4636             :  */
    4637             : 
    4638        2867 : CPLErr CPL_STDCALL GDALDatasetCopyWholeRaster(GDALDatasetH hSrcDS,
    4639             :                                               GDALDatasetH hDstDS,
    4640             :                                               CSLConstList papszOptions,
    4641             :                                               GDALProgressFunc pfnProgress,
    4642             :                                               void *pProgressData)
    4643             : 
    4644             : {
    4645        2867 :     VALIDATE_POINTER1(hSrcDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    4646        2867 :     VALIDATE_POINTER1(hDstDS, "GDALDatasetCopyWholeRaster", CE_Failure);
    4647             : 
    4648        2867 :     GDALDataset *poSrcDS = GDALDataset::FromHandle(hSrcDS);
    4649        2867 :     GDALDataset *poDstDS = GDALDataset::FromHandle(hDstDS);
    4650             : 
    4651        2867 :     if (pfnProgress == nullptr)
    4652           3 :         pfnProgress = GDALDummyProgress;
    4653             : 
    4654             :     /* -------------------------------------------------------------------- */
    4655             :     /*      Confirm the datasets match in size and band counts.             */
    4656             :     /* -------------------------------------------------------------------- */
    4657        2867 :     const int nXSize = poDstDS->GetRasterXSize();
    4658        2867 :     const int nYSize = poDstDS->GetRasterYSize();
    4659        2867 :     const int nBandCount = poDstDS->GetRasterCount();
    4660             : 
    4661        2867 :     if (poSrcDS->GetRasterXSize() != nXSize ||
    4662        5734 :         poSrcDS->GetRasterYSize() != nYSize ||
    4663        2867 :         poSrcDS->GetRasterCount() != nBandCount)
    4664             :     {
    4665           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    4666             :                  "Input and output dataset sizes or band counts do not\n"
    4667             :                  "match in GDALDatasetCopyWholeRaster()");
    4668           0 :         return CE_Failure;
    4669             :     }
    4670             : 
    4671             :     /* -------------------------------------------------------------------- */
    4672             :     /*      Report preliminary (0) progress.                                */
    4673             :     /* -------------------------------------------------------------------- */
    4674        2867 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    4675             :     {
    4676           1 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    4677             :                  "User terminated CreateCopy()");
    4678           1 :         return CE_Failure;
    4679             :     }
    4680             : 
    4681             :     /* -------------------------------------------------------------------- */
    4682             :     /*      Get our prototype band, and assume the others are similarly     */
    4683             :     /*      configured.                                                     */
    4684             :     /* -------------------------------------------------------------------- */
    4685        2866 :     if (nBandCount == 0)
    4686           0 :         return CE_None;
    4687             : 
    4688        2866 :     GDALRasterBand *poSrcPrototypeBand = poSrcDS->GetRasterBand(1);
    4689        2866 :     GDALRasterBand *poDstPrototypeBand = poDstDS->GetRasterBand(1);
    4690        2866 :     GDALDataType eDT = poDstPrototypeBand->GetRasterDataType();
    4691             : 
    4692             :     /* -------------------------------------------------------------------- */
    4693             :     /*      Do we want to try and do the operation in a pixel               */
    4694             :     /*      interleaved fashion?                                            */
    4695             :     /* -------------------------------------------------------------------- */
    4696        2866 :     bool bInterleave = false;
    4697             :     const char *pszInterleave =
    4698        2866 :         poSrcDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    4699        2866 :     if (pszInterleave != nullptr &&
    4700        1129 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    4701         170 :         bInterleave = true;
    4702             : 
    4703        2866 :     pszInterleave = poDstDS->GetMetadataItem("INTERLEAVE", "IMAGE_STRUCTURE");
    4704        2866 :     if (pszInterleave != nullptr &&
    4705        2062 :         (EQUAL(pszInterleave, "PIXEL") || EQUAL(pszInterleave, "LINE")))
    4706        1293 :         bInterleave = true;
    4707             : 
    4708        2866 :     pszInterleave = CSLFetchNameValue(papszOptions, "INTERLEAVE");
    4709        2866 :     if (pszInterleave != nullptr && EQUAL(pszInterleave, "PIXEL"))
    4710           5 :         bInterleave = true;
    4711        2861 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "BAND"))
    4712           7 :         bInterleave = false;
    4713             :     // ATTRIBUTES is specific to the TileDB driver.
    4714        2854 :     else if (pszInterleave != nullptr && EQUAL(pszInterleave, "ATTRIBUTES"))
    4715           4 :         bInterleave = true;
    4716        2850 :     else if (pszInterleave != nullptr)
    4717             :     {
    4718           0 :         CPLError(CE_Warning, CPLE_NotSupported,
    4719             :                  "Unsupported value for option INTERLEAVE");
    4720             :     }
    4721             : 
    4722             :     // If the destination is compressed, we must try to write blocks just once,
    4723             :     // to save disk space (GTiff case for example), and to avoid data loss
    4724             :     // (JPEG compression for example).
    4725        2866 :     bool bDstIsCompressed = false;
    4726             :     const char *pszDstCompressed =
    4727        2866 :         CSLFetchNameValue(papszOptions, "COMPRESSED");
    4728        2866 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    4729         349 :         bDstIsCompressed = true;
    4730             : 
    4731             :     /* -------------------------------------------------------------------- */
    4732             :     /*      What will our swath size be?                                    */
    4733             :     /* -------------------------------------------------------------------- */
    4734             : 
    4735        2866 :     int nSwathCols = 0;
    4736        2866 :     int nSwathLines = 0;
    4737        2866 :     GDALCopyWholeRasterGetSwathSize(poSrcPrototypeBand, poDstPrototypeBand,
    4738             :                                     nBandCount, bDstIsCompressed, bInterleave,
    4739             :                                     &nSwathCols, &nSwathLines);
    4740             : 
    4741        2866 :     int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    4742        2866 :     if (bInterleave)
    4743        1340 :         nPixelSize *= nBandCount;
    4744             : 
    4745        2866 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);
    4746        2866 :     if (pSwathBuf == nullptr)
    4747             :     {
    4748           0 :         return CE_Failure;
    4749             :     }
    4750             : 
    4751        2866 :     CPLDebug("GDAL",
    4752             :              "GDALDatasetCopyWholeRaster(): %d*%d swaths, bInterleave=%d",
    4753             :              nSwathCols, nSwathLines, static_cast<int>(bInterleave));
    4754             : 
    4755             :     // Advise the source raster that we are going to read it completely
    4756             :     // Note: this might already have been done by GDALCreateCopy() in the
    4757             :     // likely case this function is indirectly called by it
    4758        2866 :     poSrcDS->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nBandCount,
    4759        2866 :                         nullptr, nullptr);
    4760             : 
    4761             :     /* ==================================================================== */
    4762             :     /*      Band oriented (uninterleaved) case.                             */
    4763             :     /* ==================================================================== */
    4764        2866 :     CPLErr eErr = CE_None;
    4765             :     const bool bCheckHoles =
    4766        2866 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    4767             : 
    4768        2866 :     if (!bInterleave)
    4769             :     {
    4770             :         GDALRasterIOExtraArg sExtraArg;
    4771        1526 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    4772        1526 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    4773             : 
    4774        4578 :         const GIntBig nTotalBlocks = static_cast<GIntBig>(nBandCount) *
    4775        1526 :                                      DIV_ROUND_UP(nYSize, nSwathLines) *
    4776        1526 :                                      DIV_ROUND_UP(nXSize, nSwathCols);
    4777        1526 :         GIntBig nBlocksDone = 0;
    4778             : 
    4779        3756 :         for (int iBand = 0; iBand < nBandCount && eErr == CE_None; iBand++)
    4780             :         {
    4781        2230 :             int nBand = iBand + 1;
    4782             : 
    4783        4618 :             for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    4784             :             {
    4785        2388 :                 int nThisLines = nSwathLines;
    4786             : 
    4787        2388 :                 if (iY + nThisLines > nYSize)
    4788         274 :                     nThisLines = nYSize - iY;
    4789             : 
    4790        4776 :                 for (int iX = 0; iX < nXSize && eErr == CE_None;
    4791        2388 :                      iX += nSwathCols)
    4792             :                 {
    4793        2388 :                     int nThisCols = nSwathCols;
    4794             : 
    4795        2388 :                     if (iX + nThisCols > nXSize)
    4796           0 :                         nThisCols = nXSize - iX;
    4797             : 
    4798        2388 :                     int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    4799        2388 :                     if (bCheckHoles)
    4800             :                     {
    4801             :                         nStatus = poSrcDS->GetRasterBand(nBand)
    4802         960 :                                       ->GetDataCoverageStatus(
    4803             :                                           iX, iY, nThisCols, nThisLines,
    4804             :                                           GDAL_DATA_COVERAGE_STATUS_DATA);
    4805             :                     }
    4806        2388 :                     if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    4807             :                     {
    4808        2384 :                         sExtraArg.pfnProgress = GDALScaledProgress;
    4809        4768 :                         sExtraArg.pProgressData = GDALCreateScaledProgress(
    4810        2384 :                             nBlocksDone / static_cast<double>(nTotalBlocks),
    4811        2384 :                             (nBlocksDone + 0.5) /
    4812        2384 :                                 static_cast<double>(nTotalBlocks),
    4813             :                             pfnProgress, pProgressData);
    4814        2384 :                         if (sExtraArg.pProgressData == nullptr)
    4815        1413 :                             sExtraArg.pfnProgress = nullptr;
    4816             : 
    4817        2384 :                         eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    4818             :                                                  nThisLines, pSwathBuf,
    4819             :                                                  nThisCols, nThisLines, eDT, 1,
    4820             :                                                  &nBand, 0, 0, 0, &sExtraArg);
    4821             : 
    4822        2384 :                         GDALDestroyScaledProgress(sExtraArg.pProgressData);
    4823             : 
    4824        2384 :                         if (eErr == CE_None)
    4825        2380 :                             eErr = poDstDS->RasterIO(
    4826             :                                 GF_Write, iX, iY, nThisCols, nThisLines,
    4827             :                                 pSwathBuf, nThisCols, nThisLines, eDT, 1,
    4828             :                                 &nBand, 0, 0, 0, nullptr);
    4829             :                     }
    4830             : 
    4831        2388 :                     nBlocksDone++;
    4832        4737 :                     if (eErr == CE_None &&
    4833        2349 :                         !pfnProgress(nBlocksDone /
    4834        2349 :                                          static_cast<double>(nTotalBlocks),
    4835             :                                      nullptr, pProgressData))
    4836             :                     {
    4837           3 :                         eErr = CE_Failure;
    4838           3 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    4839             :                                  "User terminated CreateCopy()");
    4840             :                     }
    4841             :                 }
    4842             :             }
    4843             :         }
    4844             :     }
    4845             : 
    4846             :     /* ==================================================================== */
    4847             :     /*      Pixel interleaved case.                                         */
    4848             :     /* ==================================================================== */
    4849             :     else /* if( bInterleave ) */
    4850             :     {
    4851             :         GDALRasterIOExtraArg sExtraArg;
    4852        1340 :         INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    4853        1340 :         CPL_IGNORE_RET_VAL(sExtraArg.pfnProgress);  // to make cppcheck happy
    4854             : 
    4855        1340 :         const GIntBig nTotalBlocks =
    4856        1340 :             static_cast<GIntBig>(DIV_ROUND_UP(nYSize, nSwathLines)) *
    4857        1340 :             DIV_ROUND_UP(nXSize, nSwathCols);
    4858        1340 :         GIntBig nBlocksDone = 0;
    4859             : 
    4860        2906 :         for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    4861             :         {
    4862        1566 :             int nThisLines = nSwathLines;
    4863             : 
    4864        1566 :             if (iY + nThisLines > nYSize)
    4865         204 :                 nThisLines = nYSize - iY;
    4866             : 
    4867        3137 :             for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    4868             :             {
    4869        1571 :                 int nThisCols = nSwathCols;
    4870             : 
    4871        1571 :                 if (iX + nThisCols > nXSize)
    4872           3 :                     nThisCols = nXSize - iX;
    4873             : 
    4874        1571 :                 int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;
    4875        1571 :                 if (bCheckHoles)
    4876             :                 {
    4877        1344 :                     nStatus = 0;
    4878        1397 :                     for (int iBand = 0; iBand < nBandCount; iBand++)
    4879             :                     {
    4880        1378 :                         nStatus |= poSrcDS->GetRasterBand(iBand + 1)
    4881        1378 :                                        ->GetDataCoverageStatus(
    4882             :                                            iX, iY, nThisCols, nThisLines,
    4883             :                                            GDAL_DATA_COVERAGE_STATUS_DATA);
    4884        1378 :                         if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    4885        1325 :                             break;
    4886             :                     }
    4887             :                 }
    4888        1571 :                 if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)
    4889             :                 {
    4890        1552 :                     sExtraArg.pfnProgress = GDALScaledProgress;
    4891        3104 :                     sExtraArg.pProgressData = GDALCreateScaledProgress(
    4892        1552 :                         nBlocksDone / static_cast<double>(nTotalBlocks),
    4893        1552 :                         (nBlocksDone + 0.5) / static_cast<double>(nTotalBlocks),
    4894             :                         pfnProgress, pProgressData);
    4895        1552 :                     if (sExtraArg.pProgressData == nullptr)
    4896         331 :                         sExtraArg.pfnProgress = nullptr;
    4897             : 
    4898        1552 :                     eErr = poSrcDS->RasterIO(GF_Read, iX, iY, nThisCols,
    4899             :                                              nThisLines, pSwathBuf, nThisCols,
    4900             :                                              nThisLines, eDT, nBandCount,
    4901             :                                              nullptr, 0, 0, 0, &sExtraArg);
    4902             : 
    4903        1552 :                     GDALDestroyScaledProgress(sExtraArg.pProgressData);
    4904             : 
    4905        1552 :                     if (eErr == CE_None)
    4906        1551 :                         eErr = poDstDS->RasterIO(
    4907             :                             GF_Write, iX, iY, nThisCols, nThisLines, pSwathBuf,
    4908             :                             nThisCols, nThisLines, eDT, nBandCount, nullptr, 0,
    4909             :                             0, 0, nullptr);
    4910             :                 }
    4911             : 
    4912        1571 :                 nBlocksDone++;
    4913        3138 :                 if (eErr == CE_None &&
    4914        1567 :                     !pfnProgress(nBlocksDone /
    4915        1567 :                                      static_cast<double>(nTotalBlocks),
    4916             :                                  nullptr, pProgressData))
    4917             :                 {
    4918           1 :                     eErr = CE_Failure;
    4919           1 :                     CPLError(CE_Failure, CPLE_UserInterrupt,
    4920             :                              "User terminated CreateCopy()");
    4921             :                 }
    4922             :             }
    4923             :         }
    4924             :     }
    4925             : 
    4926             :     /* -------------------------------------------------------------------- */
    4927             :     /*      Cleanup                                                         */
    4928             :     /* -------------------------------------------------------------------- */
    4929        2866 :     CPLFree(pSwathBuf);
    4930             : 
    4931        2866 :     return eErr;
    4932             : }
    4933             : 
    4934             : /************************************************************************/
    4935             : /*                     GDALRasterBandCopyWholeRaster()                  */
    4936             : /************************************************************************/
    4937             : 
    4938             : /**
    4939             :  * \brief Copy a whole raster band
    4940             :  *
    4941             :  * This function copies the complete raster contents of one band to
    4942             :  * another similarly configured band.  The source and destination
    4943             :  * bands must have the same width and height.  The bands do not have
    4944             :  * to have the same data type; pixels are transferred using the destination band's data type.
    4945             :  *
    4946             :  * It implements efficient copying, in particular "chunking" the copy in
    4947             :  * substantial blocks.
    4948             :  *
    4949             :  * Currently the only papszOptions values supported are:
    4950             :  * <ul>
    4951             :  * <li>"COMPRESSED=YES" to force alignment on target dataset block sizes to
    4952             :  * achieve best compression.</li>
    4953             :  * <li>"SKIP_HOLES=YES" to skip chunks for which GDALGetDataCoverageStatus()
    4954             :  * returns GDAL_DATA_COVERAGE_STATUS_EMPTY (GDAL &gt;= 2.2)</li>
    4955             :  * </ul>
    4956             :  *
    4957             :  * @param hSrcBand the source band
    4958             :  * @param hDstBand the destination band
    4959             :  * @param papszOptions transfer hints in "StringList" Name=Value format.
    4960             :  * @param pfnProgress progress reporting function; may be NULL, in which case GDALDummyProgress is used.
    4961             :  * @param pProgressData callback data for progress function.
    4962             :  *
    4963             :  * @return CE_None on success, or CE_Failure on failure (including when pfnProgress returns FALSE).
    4964             :  */
    4965             : 
    4966          29 : CPLErr CPL_STDCALL GDALRasterBandCopyWholeRaster(
    4967             :     GDALRasterBandH hSrcBand, GDALRasterBandH hDstBand,
    4968             :     const char *const *const papszOptions, GDALProgressFunc pfnProgress,
    4969             :     void *pProgressData)
    4970             : 
    4971             : {
    4972          29 :     VALIDATE_POINTER1(hSrcBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    4973          29 :     VALIDATE_POINTER1(hDstBand, "GDALRasterBandCopyWholeRaster", CE_Failure);
    4974             : 
    4975          29 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    4976          29 :     GDALRasterBand *poDstBand = GDALRasterBand::FromHandle(hDstBand);
    4977          29 :     CPLErr eErr = CE_None;
    4978             : 
    4979          29 :     if (pfnProgress == nullptr)
    4980          11 :         pfnProgress = GDALDummyProgress;
    4981             : 
    4982             :     /* -------------------------------------------------------------------- */
    4983             :     /*      Confirm the source and destination bands match in size.         */
    4984             :     /* -------------------------------------------------------------------- */
    4985          29 :     int nXSize = poSrcBand->GetXSize();
    4986          29 :     int nYSize = poSrcBand->GetYSize();
    4987             : 
    4988          29 :     if (poDstBand->GetXSize() != nXSize || poDstBand->GetYSize() != nYSize)
    4989             :     {
    4990           0 :         CPLError(CE_Failure, CPLE_AppDefined,
    4991             :                  "Input and output band sizes do not\n"
    4992             :                  "match in GDALRasterBandCopyWholeRaster()");
    4993           0 :         return CE_Failure;
    4994             :     }
    4995             : 
    4996             :     /* -------------------------------------------------------------------- */
    4997             :     /*      Report preliminary (0) progress.                                */
    4998             :     /* -------------------------------------------------------------------- */
    4999          29 :     if (!pfnProgress(0.0, nullptr, pProgressData))
    5000             :     {
    5001           0 :         CPLError(CE_Failure, CPLE_UserInterrupt,
    5002             :                  "User terminated CreateCopy()");
    5003           0 :         return CE_Failure;
    5004             :     }
    5005             : 
    5006          29 :     GDALDataType eDT = poDstBand->GetRasterDataType();  // Transfer type: the destination band's data type.
    5007             : 
    5008             :     // If the destination is compressed, we must try to write blocks just once,
    5009             :     // to save disk space (GTiff case for example), and to avoid data loss
    5010             :     // (JPEG compression for example).
    5011          29 :     bool bDstIsCompressed = false;
    5012             :     const char *pszDstCompressed =
    5013          29 :         CSLFetchNameValue(const_cast<char **>(papszOptions), "COMPRESSED");
    5014          29 :     if (pszDstCompressed != nullptr && CPLTestBool(pszDstCompressed))
    5015          17 :         bDstIsCompressed = true;
    5016             : 
    5017             :     /* -------------------------------------------------------------------- */
    5018             :     /*      What will our swath size be?                                    */
    5019             :     /* -------------------------------------------------------------------- */
    5020             : 
    5021          29 :     int nSwathCols = 0;
    5022          29 :     int nSwathLines = 0;
    5023          29 :     GDALCopyWholeRasterGetSwathSize(poSrcBand, poDstBand, 1, bDstIsCompressed,
    5024             :                                     FALSE, &nSwathCols, &nSwathLines);  // NOTE(review): the meaning of the literal 1 and FALSE arguments is defined by GDALCopyWholeRasterGetSwathSize() — confirm there.
    5025             : 
    5026          29 :     const int nPixelSize = GDALGetDataTypeSizeBytes(eDT);
    5027             : 
    5028          29 :     void *pSwathBuf = VSI_MALLOC3_VERBOSE(nSwathCols, nSwathLines, nPixelSize);  // Buffer holding one swath of eDT pixels.
    5029          29 :     if (pSwathBuf == nullptr)
    5030             :     {
    5031           0 :         return CE_Failure;
    5032             :     }
    5033             : 
    5034          29 :     CPLDebug("GDAL", "GDALRasterBandCopyWholeRaster(): %d*%d swaths",
    5035             :              nSwathCols, nSwathLines);
    5036             : 
    5037             :     const bool bCheckHoles =
    5038          29 :         CPLTestBool(CSLFetchNameValueDef(papszOptions, "SKIP_HOLES", "NO"));
    5039             : 
    5040             :     // Advise the source raster that we are going to read it completely
    5041          29 :     poSrcBand->AdviseRead(0, 0, nXSize, nYSize, nXSize, nYSize, eDT, nullptr);
    5042             : 
    5043             :     /* ==================================================================== */
    5044             :     /*      Copy the band one swath at a time.                              */
    5045             :     /* ==================================================================== */
    5046             : 
    5047          67 :     for (int iY = 0; iY < nYSize && eErr == CE_None; iY += nSwathLines)
    5048             :     {
    5049          38 :         int nThisLines = nSwathLines;
    5050             : 
    5051          38 :         if (iY + nThisLines > nYSize)  // The last row of swaths may be shorter.
    5052           3 :             nThisLines = nYSize - iY;
    5053             : 
    5054          76 :         for (int iX = 0; iX < nXSize && eErr == CE_None; iX += nSwathCols)
    5055             :         {
    5056          38 :             int nThisCols = nSwathCols;
    5057             : 
    5058          38 :             if (iX + nThisCols > nXSize)  // The last column of swaths may be narrower.
    5059           0 :                 nThisCols = nXSize - iX;
    5060             : 
    5061          38 :             int nStatus = GDAL_DATA_COVERAGE_STATUS_DATA;  // Default when SKIP_HOLES is off: treat the chunk as containing data.
    5062          38 :             if (bCheckHoles)
    5063             :             {
    5064           0 :                 nStatus = poSrcBand->GetDataCoverageStatus(
    5065             :                     iX, iY, nThisCols, nThisLines,
    5066             :                     GDAL_DATA_COVERAGE_STATUS_DATA);
    5067             :             }
    5068          38 :             if (nStatus & GDAL_DATA_COVERAGE_STATUS_DATA)  // Chunks not flagged as containing data are neither read nor written.
    5069             :             {
    5070          38 :                 eErr = poSrcBand->RasterIO(GF_Read, iX, iY, nThisCols,
    5071             :                                            nThisLines, pSwathBuf, nThisCols,
    5072             :                                            nThisLines, eDT, 0, 0, nullptr);
    5073             : 
    5074          38 :                 if (eErr == CE_None)
    5075          38 :                     eErr = poDstBand->RasterIO(GF_Write, iX, iY, nThisCols,
    5076             :                                                nThisLines, pSwathBuf, nThisCols,
    5077             :                                                nThisLines, eDT, 0, 0, nullptr);
    5078             :             }
    5079             : 
    5080          76 :             if (eErr == CE_None &&
    5081          38 :                 !pfnProgress((iY + nThisLines) / static_cast<float>(nYSize),  // Progress is the fraction of rows covered so far; it does not depend on iX.
    5082             :                              nullptr, pProgressData))
    5083             :             {
    5084           0 :                 eErr = CE_Failure;
    5085           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt,
    5086             :                          "User terminated CreateCopy()");
    5087             :             }
    5088             :         }
    5089             :     }
    5090             : 
    5091             :     /* -------------------------------------------------------------------- */
    5092             :     /*      Cleanup                                                         */
    5093             :     /* -------------------------------------------------------------------- */
    5094          29 :     CPLFree(pSwathBuf);
    5095             : 
    5096          29 :     return eErr;
    5097             : }
    5098             : 
    5099             : /************************************************************************/
    5100             : /*                      GDALCopyRasterIOExtraArg ()                     */
    5101             : /************************************************************************/
    5102             : // Initialize *psDestArg with INIT_RASTERIO_EXTRA_ARG, then, if psSrcArg is non-null, copy its fields into *psDestArg.
    5103      323330 : void GDALCopyRasterIOExtraArg(GDALRasterIOExtraArg *psDestArg,
    5104             :                               GDALRasterIOExtraArg *psSrcArg)
    5105             : {
    5106      323330 :     INIT_RASTERIO_EXTRA_ARG(*psDestArg);  // Always executed, so a null psSrcArg leaves only the initialized values.
    5107      323330 :     if (psSrcArg)
    5108             :     {
    5109      323330 :         psDestArg->eResampleAlg = psSrcArg->eResampleAlg;
    5110      323330 :         psDestArg->pfnProgress = psSrcArg->pfnProgress;
    5111      323330 :         psDestArg->pProgressData = psSrcArg->pProgressData;
    5112      323330 :         psDestArg->bFloatingPointWindowValidity =
    5113      323330 :             psSrcArg->bFloatingPointWindowValidity;
    5114      323330 :         if (psSrcArg->bFloatingPointWindowValidity)  // The floating-point window is copied only when the source flags it as valid.
    5115             :         {
    5116        3116 :             psDestArg->dfXOff = psSrcArg->dfXOff;
    5117        3116 :             psDestArg->dfYOff = psSrcArg->dfYOff;
    5118        3116 :             psDestArg->dfXSize = psSrcArg->dfXSize;
    5119        3116 :             psDestArg->dfYSize = psSrcArg->dfYSize;
    5120             :         }
    5121             :     }
    5122      323330 : }
    5123             : 
    5124             : /************************************************************************/
    5125             : /*                         HasOnlyNoData()                              */
    5126             : /************************************************************************/
    5127             : // Generic case: a sample matches the nodata value when the two compare equal.
    5128    24858042 : template <class T> static inline bool IsEqualToNoData(T value, T noDataValue)
    5129             : {
    5130    24858042 :     return value == noDataValue;
    5131             : }
    5132             : // float specialization: when the nodata value is NaN, any NaN sample matches (operator== alone would never match NaN).
    5133      560303 : template <> bool IsEqualToNoData<float>(float value, float noDataValue)
    5134             : {
    5135      560303 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5136             : }
    5137             : // double specialization: same NaN-aware comparison as for float.
    5138      501120 : template <> bool IsEqualToNoData<double>(double value, double noDataValue)
    5139             : {
    5140      501120 :     return std::isnan(noDataValue) ? std::isnan(value) : value == noDataValue;
    5141             : }
    5142             : // Return true if every sample of the nWidth x nHeight window (nComponents samples per pixel) matches noDataValue per IsEqualToNoData().
    5143             : template <class T>
    5144       12090 : static bool HasOnlyNoDataT(const T *pBuffer, T noDataValue, size_t nWidth,
    5145             :                            size_t nHeight, size_t nLineStride,  // nLineStride: row pitch in pixels (it is multiplied by nComponents below).
    5146             :                            size_t nComponents)
    5147             : {
    5148             :     // Fast test: check the 4 corners and the middle pixel. Assumes nWidth >= 1 and nHeight >= 1 (the size_t subtractions would wrap otherwise) — TODO confirm at call sites.
    5149       23430 :     for (size_t iBand = 0; iBand < nComponents; iBand++)
    5150             :     {
    5151       24917 :         if (!(IsEqualToNoData(pBuffer[iBand], noDataValue) &&  // top-left pixel
    5152       12373 :               IsEqualToNoData(pBuffer[(nWidth - 1) * nComponents + iBand],  // top-right pixel
    5153       12284 :                               noDataValue) &&
    5154       12284 :               IsEqualToNoData(
    5155       12284 :                   pBuffer[((nHeight - 1) / 2 * nLineStride + (nWidth - 1) / 2) *  // middle pixel
    5156       12284 :                               nComponents +
    5157             :                           iBand],
    5158       11351 :                   noDataValue) &&
    5159       11351 :               IsEqualToNoData(
    5160       11351 :                   pBuffer[(nHeight - 1) * nLineStride * nComponents + iBand],  // bottom-left pixel
    5161             :                   noDataValue) &&
    5162       11343 :               IsEqualToNoData(
    5163       11343 :                   pBuffer[((nHeight - 1) * nLineStride + nWidth - 1) *  // bottom-right pixel
    5164       11343 :                               nComponents +
    5165             :                           iBand],
    5166             :                   noDataValue)))
    5167             :         {
    5168        1204 :             return false;
    5169             :         }
    5170             :     }
    5171             : 
    5172             :     // Test all pixels.
    5173       37776 :     for (size_t iY = 0; iY < nHeight; iY++)
    5174             :     {
    5175       26914 :         const T *pBufferLine = pBuffer + iY * nLineStride * nComponents;  // Start of row iY.
    5176    25886447 :         for (size_t iX = 0; iX < nWidth * nComponents; iX++)  // Only the first nWidth pixels of each row are examined; any stride padding is ignored.
    5177             :         {
    5178    25859623 :             if (!IsEqualToNoData(pBufferLine[iX], noDataValue))
    5179             :             {
    5180          24 :                 return false;
    5181             :             }
    5182             :         }
    5183             :     }
    5184       10862 :     return true;
    5185             : }
    5186             : 
    5187             : /************************************************************************/
    5188             : /*                    GDALBufferHasOnlyNoData()                         */
    5189             : /************************************************************************/
    5190             : // Return true if all samples of the buffer equal dfNoDataValue; the comparison routine is selected from nBitsPerSample and nSampleFormat.
    5191       35354 : bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
    5192             :                              size_t nWidth, size_t nHeight, size_t nLineStride,
    5193             :                              size_t nComponents, int nBitsPerSample,
    5194             :                              GDALBufferSampleFormat nSampleFormat)
    5195             : {
    5196             :     // In the case where the nodata is 0, we can compare several bytes at
    5197             :     // once. Select the largest natural integer type for the architecture.
    5198             : #if SIZEOF_VOIDP >= 8 || defined(__x86_64__)
    5199             :     // We test __x86_64__ for x32 arch where SIZEOF_VOIDP == 4
    5200             :     typedef std::uint64_t WordType;
    5201             : #else
    5202             :     typedef std::uint32_t WordType;
    5203             : #endif
    5204       35354 :     if (dfNoDataValue == 0.0 && nWidth == nLineStride &&  // Fast path also requires rows without padding (nWidth == nLineStride).
    5205             :         // Do not use this optimized code path for floating point numbers,
    5206             :         // as it can't detect negative zero.
    5207             :         nSampleFormat != GSF_FLOATING_POINT)
    5208             :     {
    5209       23258 :         const GByte *pabyBuffer = static_cast<const GByte *>(pBuffer);
    5210       23258 :         const size_t nSize =
    5211       23258 :             (nWidth * nHeight * nComponents * nBitsPerSample + 7) / 8;  // Buffer size in bytes, rounding the total bit count up.
    5212       23258 :         size_t i = 0;
    5213             :         const size_t nInitialIters =  // Bytes up to the next WordType boundary (a full word's worth if already aligned), capped at nSize.
    5214       46516 :             std::min(sizeof(WordType) -
    5215       23258 :                          static_cast<size_t>(
    5216             :                              reinterpret_cast<std::uintptr_t>(pabyBuffer) %
    5217             :                              sizeof(WordType)),
    5218       23258 :                      nSize);
    5219      194736 :         for (; i < nInitialIters; i++)  // Byte-wise head.
    5220             :         {
    5221      174773 :             if (pabyBuffer[i])
    5222        3295 :                 return false;
    5223             :         }
    5224    16095400 :         for (; i + sizeof(WordType) - 1 < nSize; i += sizeof(WordType))  // Word-wise body, while a whole word still fits.
    5225             :         {
    5226    16080900 :             if (*(reinterpret_cast<const WordType *>(pabyBuffer + i)))
    5227        5428 :                 return false;
    5228             :         }
    5229       51884 :         for (; i < nSize; i++)  // Byte-wise tail.
    5230             :         {
    5231       37354 :             if (pabyBuffer[i])
    5232           5 :                 return false;
    5233             :         }
    5234       14530 :         return true;
    5235             :     }
    5236             : 
    5237       12096 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_UNSIGNED_INT)
    5238             :     {
    5239       22234 :         return GDALIsValueInRange<uint8_t>(dfNoDataValue) &&  // In this and the branches below, a false GDALIsValueInRange() short-circuits the result to false.
    5240       11117 :                HasOnlyNoDataT(static_cast<const uint8_t *>(pBuffer),
    5241       11117 :                               static_cast<uint8_t>(dfNoDataValue), nWidth,
    5242       11117 :                               nHeight, nLineStride, nComponents);
    5243             :     }
    5244         979 :     if (nBitsPerSample == 8 && nSampleFormat == GSF_SIGNED_INT)
    5245             :     {
    5246             :         // Use unsigned implementation by converting the nodatavalue to
    5247             :         // unsigned
    5248          63 :         return GDALIsValueInRange<int8_t>(dfNoDataValue) &&
    5249          31 :                HasOnlyNoDataT(
    5250             :                    static_cast<const uint8_t *>(pBuffer),
    5251          31 :                    static_cast<uint8_t>(static_cast<int8_t>(dfNoDataValue)),
    5252          32 :                    nWidth, nHeight, nLineStride, nComponents);
    5253             :     }
    5254         947 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_UNSIGNED_INT)
    5255             :     {
    5256          21 :         return GDALIsValueInRange<uint16_t>(dfNoDataValue) &&
    5257          10 :                HasOnlyNoDataT(static_cast<const uint16_t *>(pBuffer),
    5258          10 :                               static_cast<uint16_t>(dfNoDataValue), nWidth,
    5259          11 :                               nHeight, nLineStride, nComponents);
    5260             :     }
    5261         936 :     if (nBitsPerSample == 16 && nSampleFormat == GSF_SIGNED_INT)
    5262             :     {
    5263             :         // Use unsigned implementation by converting the nodatavalue to
    5264             :         // unsigned
    5265         109 :         return GDALIsValueInRange<int16_t>(dfNoDataValue) &&
    5266          54 :                HasOnlyNoDataT(
    5267             :                    static_cast<const uint16_t *>(pBuffer),
    5268          54 :                    static_cast<uint16_t>(static_cast<int16_t>(dfNoDataValue)),
    5269          55 :                    nWidth, nHeight, nLineStride, nComponents);
    5270             :     }
    5271         881 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_UNSIGNED_INT)
    5272             :     {
    5273          73 :         return GDALIsValueInRange<uint32_t>(dfNoDataValue) &&
    5274          36 :                HasOnlyNoDataT(static_cast<const uint32_t *>(pBuffer),
    5275             :                               static_cast<uint32_t>(dfNoDataValue), nWidth,
    5276          37 :                               nHeight, nLineStride, nComponents);
    5277             :     }
    5278         844 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_SIGNED_INT)
    5279             :     {
    5280             :         // Use unsigned implementation by converting the nodatavalue to
    5281             :         // unsigned
    5282          19 :         return GDALIsValueInRange<int32_t>(dfNoDataValue) &&
    5283           9 :                HasOnlyNoDataT(
    5284             :                    static_cast<const uint32_t *>(pBuffer),
    5285           9 :                    static_cast<uint32_t>(static_cast<int32_t>(dfNoDataValue)),
    5286          10 :                    nWidth, nHeight, nLineStride, nComponents);
    5287             :     }
    5288         834 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_UNSIGNED_INT)
    5289             :     {
    5290          56 :         return GDALIsValueInRange<uint64_t>(dfNoDataValue) &&
    5291          28 :                HasOnlyNoDataT(static_cast<const uint64_t *>(pBuffer),
    5292             :                               static_cast<uint64_t>(dfNoDataValue), nWidth,
    5293          28 :                               nHeight, nLineStride, nComponents);
    5294             :     }
    5295         806 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_SIGNED_INT)
    5296             :     {
    5297             :         // Use unsigned implementation by converting the nodatavalue to
    5298             :         // unsigned
    5299           0 :         return GDALIsValueInRange<int64_t>(dfNoDataValue) &&
    5300           0 :                HasOnlyNoDataT(
    5301             :                    static_cast<const uint64_t *>(pBuffer),
    5302           0 :                    static_cast<uint64_t>(static_cast<int64_t>(dfNoDataValue)),
    5303           0 :                    nWidth, nHeight, nLineStride, nComponents);
    5304             :     }
    5305         806 :     if (nBitsPerSample == 32 && nSampleFormat == GSF_FLOATING_POINT)
    5306             :     {
    5307        1341 :         return (std::isnan(dfNoDataValue) ||  // A NaN nodata value bypasses the range check.
    5308        1370 :                 GDALIsValueInRange<float>(dfNoDataValue)) &&
    5309         684 :                HasOnlyNoDataT(static_cast<const float *>(pBuffer),
    5310             :                               static_cast<float>(dfNoDataValue), nWidth,
    5311         685 :                               nHeight, nLineStride, nComponents);
    5312             :     }
    5313         121 :     if (nBitsPerSample == 64 && nSampleFormat == GSF_FLOATING_POINT)
    5314             :     {
    5315         121 :         return HasOnlyNoDataT(static_cast<const double *>(pBuffer),
    5316             :                               dfNoDataValue, nWidth, nHeight, nLineStride,
    5317         121 :                               nComponents);
    5318             :     }
    5319           0 :     return false;  // No branch above handles this bit depth / sample format combination.
    5320             : }
    5321             : 
    5322             : #ifdef HAVE_SSE2
    5323             : 
    5324             : /************************************************************************/
    5325             : /*                    GDALDeinterleave3Byte()                           */
    5326             : /************************************************************************/
    5327             : // Split nIters interleaved 3-byte pixels from pabySrc into three separate byte planes (pabyDest0, pabyDest1, pabyDest2).
    5328             : #if defined(__GNUC__) && !defined(__clang__)
    5329             : __attribute__((optimize("no-tree-vectorize")))
    5330             : #endif
    5331             : static void
    5332       69765 : GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    5333             :                       GByte *CPL_RESTRICT pabyDest0,
    5334             :                       GByte *CPL_RESTRICT pabyDest1,
    5335             :                       GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    5336             : #ifdef USE_NEON_OPTIMIZATIONS
    5337             : {
    5338             :     return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5339             :                                        nIters);  // NOTE(review): the _SSSE3-named routine is also the one used for the NEON build; presumably it is provided through an SSE-to-NEON layer — confirm.
    5340             : }
    5341             : #else
    5342             : {
    5343             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    5344       69765 :     if (CPLHaveRuntimeSSSE3())  // Prefer the SSSE3 routine whenever the CPU reports support at run time.
    5345             :     {
    5346       69804 :         return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    5347       69795 :                                            pabyDest2, nIters);
    5348             :     }
    5349             : #endif
    5350             : 
    5351           2 :     size_t i = 0;
    5352           2 :     if (((reinterpret_cast<uintptr_t>(pabySrc) |  // Word-based loop is taken only when all four pointers are aligned on sizeof(unsigned int).
    5353           2 :           reinterpret_cast<uintptr_t>(pabyDest0) |
    5354           2 :           reinterpret_cast<uintptr_t>(pabyDest1) |
    5355           2 :           reinterpret_cast<uintptr_t>(pabyDest2)) %
    5356             :          sizeof(unsigned int)) == 0)
    5357             :     {
    5358             :         // Slightly better than GCC autovectorizer
    5359          17 :         for (size_t j = 0; i + 3 < nIters; i += 4, ++j)  // 4 pixels (12 source bytes, read as three 32-bit words) per iteration.
    5360             :         {
    5361          15 :             unsigned int word0 =
    5362          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i);
    5363          15 :             unsigned int word1 =
    5364          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 4);
    5365          15 :             unsigned int word2 =
    5366          15 :                 *reinterpret_cast<const unsigned int *>(pabySrc + 3 * i + 8);
    5367          15 :             reinterpret_cast<unsigned int *>(pabyDest0)[j] =  // Source bytes 0, 3, 6, 9 of the group; the shifts match the scalar loop below only for little-endian byte order.
    5368          15 :                 (word0 & 0xff) | ((word0 >> 24) << 8) | (word1 & 0x00ff0000) |
    5369          15 :                 ((word2 >> 8) << 24);
    5370          15 :             reinterpret_cast<unsigned int *>(pabyDest1)[j] =  // Source bytes 1, 4, 7, 10 of the group.
    5371          15 :                 ((word0 >> 8) & 0xff) | ((word1 & 0xff) << 8) |
    5372          15 :                 (((word1 >> 24)) << 16) | ((word2 >> 16) << 24);
    5373          15 :             pabyDest2[j * 4] = static_cast<GByte>(word0 >> 16);  // Source bytes 2, 5, 8, 11 of the group, stored one byte at a time.
    5374          15 :             pabyDest2[j * 4 + 1] = static_cast<GByte>(word1 >> 8);
    5375          15 :             pabyDest2[j * 4 + 2] = static_cast<GByte>(word2);
    5376          15 :             pabyDest2[j * 4 + 3] = static_cast<GByte>(word2 >> 24);
    5377             :         }
    5378             :     }
    5379             : #if defined(__clang__)
    5380             : #pragma clang loop vectorize(disable)
    5381             : #endif
    5382           3 :     for (; i < nIters; ++i)  // Scalar loop: remaining pixels, or all of them when the pointers are not aligned.
    5383             :     {
    5384           1 :         pabyDest0[i] = pabySrc[3 * i + 0];
    5385           1 :         pabyDest1[i] = pabySrc[3 * i + 1];
    5386           1 :         pabyDest2[i] = pabySrc[3 * i + 2];
    5387             :     }
    5388             : }
    5389             : #endif
    5390             : 
    5391             : /************************************************************************/
    5392             : /*                    GDALDeinterleave4Byte()                           */
    5393             : /************************************************************************/
    5394             : 
    5395             : #if !defined(__GNUC__) || defined(__clang__)
    5396             : 
    5397             : /************************************************************************/
    5398             : /*                         deinterleave()                               */
    5399             : /************************************************************************/
    5400             : // Pack one value per int32 lane of the four input registers (16 lanes) into a single 16-byte register, after an optional in-place shift (SHIFT) and an optional low-byte mask (MASK).
    5401             : template <bool SHIFT, bool MASK>
    5402             : inline __m128i deinterleave(__m128i &xmm0_ori, __m128i &xmm1_ori,
    5403             :                             __m128i &xmm2_ori, __m128i &xmm3_ori)
    5404             : {
    5405             :     // Shift each int32 lane right by 8 bits. This modifies the caller's registers, so successive calls with SHIFT see the next byte of each lane.
    5406             :     if (SHIFT)
    5407             :     {
    5408             :         xmm0_ori = _mm_srli_epi32(xmm0_ori, 8);
    5409             :         xmm1_ori = _mm_srli_epi32(xmm1_ori, 8);
    5410             :         xmm2_ori = _mm_srli_epi32(xmm2_ori, 8);
    5411             :         xmm3_ori = _mm_srli_epi32(xmm3_ori, 8);
    5412             :     }
    5413             :     __m128i xmm0;
    5414             :     __m128i xmm1;
    5415             :     __m128i xmm2;
    5416             :     __m128i xmm3;
    5417             :     if (MASK)  // Set the upper 24 bits of each int32 lane to 0, working on copies (the inputs are not masked).
    5418             :     {
    5419             :         const __m128i xmm_mask = _mm_set1_epi32(0xff);
    5420             :         xmm0 = _mm_and_si128(xmm0_ori, xmm_mask);
    5421             :         xmm1 = _mm_and_si128(xmm1_ori, xmm_mask);
    5422             :         xmm2 = _mm_and_si128(xmm2_ori, xmm_mask);
    5423             :         xmm3 = _mm_and_si128(xmm3_ori, xmm_mask);
    5424             :     }
    5425             :     else
    5426             :     {
    5427             :         xmm0 = xmm0_ori;  // NOTE(review): without the mask, the saturating packs below presumably rely on each lane already fitting in 8 bits — confirm at call sites.
    5428             :         xmm1 = xmm1_ori;
    5429             :         xmm2 = xmm2_ori;
    5430             :         xmm3 = xmm3_ori;
    5431             :     }
    5432             :     // Pack int32 to int16
    5433             :     xmm0 = _mm_packs_epi32(xmm0, xmm1);
    5434             :     xmm2 = _mm_packs_epi32(xmm2, xmm3);
    5435             :     // Pack int16 to uint8
    5436             :     xmm0 = _mm_packus_epi16(xmm0, xmm2);
    5437             :     return xmm0;
    5438             : }
    5439             : 
    5440             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    5441             :                                   GByte *CPL_RESTRICT pabyDest0,
    5442             :                                   GByte *CPL_RESTRICT pabyDest1,
    5443             :                                   GByte *CPL_RESTRICT pabyDest2,
    5444             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    5445             : #ifdef USE_NEON_OPTIMIZATIONS
    5446             : {
    5447             :     return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5448             :                                        pabyDest3, nIters);
    5449             : }
    5450             : #else
    5451             : {
    5452             : #ifdef HAVE_SSSE3_AT_COMPILE_TIME
    5453             :     if (CPLHaveRuntimeSSSE3())
    5454             :     {
    5455             :         return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1,
    5456             :                                            pabyDest2, pabyDest3, nIters);
    5457             :     }
    5458             : #endif
    5459             : 
    5460             :     // Not the optimal SSE2-only code, as gcc auto-vectorizer manages to
    5461             :     // do something slightly better.
    5462             :     size_t i = 0;
    5463             :     for (; i + 15 < nIters; i += 16)
    5464             :     {
    5465             :         __m128i xmm0_ori = _mm_loadu_si128(
    5466             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
    5467             :         __m128i xmm1_ori = _mm_loadu_si128(
    5468             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
    5469             :         __m128i xmm2_ori = _mm_loadu_si128(
    5470             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
    5471             :         __m128i xmm3_ori = _mm_loadu_si128(
    5472             :             reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
    5473             : 
    5474             :         _mm_storeu_si128(
    5475             :             reinterpret_cast<__m128i *>(pabyDest0 + i),
    5476             :             deinterleave<false, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5477             :         _mm_storeu_si128(
    5478             :             reinterpret_cast<__m128i *>(pabyDest1 + i),
    5479             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5480             :         _mm_storeu_si128(
    5481             :             reinterpret_cast<__m128i *>(pabyDest2 + i),
    5482             :             deinterleave<true, true>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5483             :         _mm_storeu_si128(
    5484             :             reinterpret_cast<__m128i *>(pabyDest3 + i),
    5485             :             deinterleave<true, false>(xmm0_ori, xmm1_ori, xmm2_ori, xmm3_ori));
    5486             :     }
    5487             : 
    5488             : #if defined(__clang__)
    5489             : #pragma clang loop vectorize(disable)
    5490             : #endif
    5491             :     for (; i < nIters; ++i)
    5492             :     {
    5493             :         pabyDest0[i] = pabySrc[4 * i + 0];
    5494             :         pabyDest1[i] = pabySrc[4 * i + 1];
    5495             :         pabyDest2[i] = pabySrc[4 * i + 2];
    5496             :         pabyDest3[i] = pabySrc[4 * i + 3];
    5497             :     }
    5498             : }
    5499             : #endif
    5500             : #else
    5501             : // GCC autovectorizer does an excellent job
    5502       53069 : __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
    5503             :     const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    5504             :     GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2,
    5505             :     GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    5506             : {
    5507   527000000 :     for (size_t i = 0; i < nIters; ++i)
    5508             :     {
    5509   526947000 :         pabyDest0[i] = pabySrc[4 * i + 0];
    5510   526947000 :         pabyDest1[i] = pabySrc[4 * i + 1];
    5511   526947000 :         pabyDest2[i] = pabySrc[4 * i + 2];
    5512   526947000 :         pabyDest3[i] = pabySrc[4 * i + 3];
    5513             :     }
    5514       53069 : }
    5515             : #endif
    5516             : 
    5517             : #else
    5518             : 
    5519             : /************************************************************************/
    5520             : /*                    GDALDeinterleave3Byte()                           */
    5521             : /************************************************************************/
    5522             : 
    5523             : // TODO: Enabling below could help on non-Intel architectures where GCC knows
    5524             : // how to auto-vectorize
    5525             : // #if defined(__GNUC__)
    5526             : //__attribute__((optimize("tree-vectorize")))
    5527             : // #endif
    5528             : static void GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
    5529             :                                   GByte *CPL_RESTRICT pabyDest0,
    5530             :                                   GByte *CPL_RESTRICT pabyDest1,
    5531             :                                   GByte *CPL_RESTRICT pabyDest2, size_t nIters)
    5532             : {
    5533             :     for (size_t i = 0; i < nIters; ++i)
    5534             :     {
    5535             :         pabyDest0[i] = pabySrc[3 * i + 0];
    5536             :         pabyDest1[i] = pabySrc[3 * i + 1];
    5537             :         pabyDest2[i] = pabySrc[3 * i + 2];
    5538             :     }
    5539             : }
    5540             : 
    5541             : /************************************************************************/
    5542             : /*                    GDALDeinterleave4Byte()                           */
    5543             : /************************************************************************/
    5544             : 
    5545             : // TODO: Enabling below could help on non-Intel architectures where gcc knows
    5546             : // how to auto-vectorize
    5547             : // #if defined(__GNUC__)
    5548             : //__attribute__((optimize("tree-vectorize")))
    5549             : // #endif
    5550             : static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
    5551             :                                   GByte *CPL_RESTRICT pabyDest0,
    5552             :                                   GByte *CPL_RESTRICT pabyDest1,
    5553             :                                   GByte *CPL_RESTRICT pabyDest2,
    5554             :                                   GByte *CPL_RESTRICT pabyDest3, size_t nIters)
    5555             : {
    5556             :     for (size_t i = 0; i < nIters; ++i)
    5557             :     {
    5558             :         pabyDest0[i] = pabySrc[4 * i + 0];
    5559             :         pabyDest1[i] = pabySrc[4 * i + 1];
    5560             :         pabyDest2[i] = pabySrc[4 * i + 2];
    5561             :         pabyDest3[i] = pabySrc[4 * i + 3];
    5562             :     }
    5563             : }
    5564             : 
    5565             : #endif
    5566             : 
    5567             : /************************************************************************/
    5568             : /*                      GDALDeinterleave()                              */
    5569             : /************************************************************************/
    5570             : 
    5571             : /*! Copy values from a pixel-interleaved buffer to multiple per-component
    5572             :     buffers.
    5573             : 
    5574             :     In pseudo-code
    5575             :     \verbatim
    5576             :     for(size_t i = 0; i < nIters; ++i)
    5577             :         for(int iComp = 0; iComp < nComponents; iComp++ )
    5578             :             ppDestBuffer[iComp][i] = pSourceBuffer[nComponents * i + iComp]
    5579             :     \endverbatim
    5580             : 
    5581             :     The implementation is optimized for a few cases, like de-interleaving
    5582             :     of 3- or 4-component Byte buffers.
    5583             : 
    5584             :     \since GDAL 3.6
    5585             :  */
    5586      123620 : void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
    5587             :                       int nComponents, void **ppDestBuffer,
    5588             :                       GDALDataType eDestDT, size_t nIters)
    5589             : {
    5590      123620 :     if (eSourceDT == eDestDT)
    5591             :     {
    5592      123597 :         if (eSourceDT == GDT_Byte || eSourceDT == GDT_Int8)
    5593             :         {
    5594      122864 :             if (nComponents == 3)
    5595             :             {
    5596       69769 :                 const GByte *CPL_RESTRICT pabySrc =
    5597             :                     static_cast<const GByte *>(pSourceBuffer);
    5598       69769 :                 GByte *CPL_RESTRICT pabyDest0 =
    5599             :                     static_cast<GByte *>(ppDestBuffer[0]);
    5600       69769 :                 GByte *CPL_RESTRICT pabyDest1 =
    5601             :                     static_cast<GByte *>(ppDestBuffer[1]);
    5602       69769 :                 GByte *CPL_RESTRICT pabyDest2 =
    5603             :                     static_cast<GByte *>(ppDestBuffer[2]);
    5604       69769 :                 GDALDeinterleave3Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5605             :                                       nIters);
    5606       69800 :                 return;
    5607             :             }
    5608       53095 :             else if (nComponents == 4)
    5609             :             {
    5610       53069 :                 const GByte *CPL_RESTRICT pabySrc =
    5611             :                     static_cast<const GByte *>(pSourceBuffer);
    5612       53069 :                 GByte *CPL_RESTRICT pabyDest0 =
    5613             :                     static_cast<GByte *>(ppDestBuffer[0]);
    5614       53069 :                 GByte *CPL_RESTRICT pabyDest1 =
    5615             :                     static_cast<GByte *>(ppDestBuffer[1]);
    5616       53069 :                 GByte *CPL_RESTRICT pabyDest2 =
    5617             :                     static_cast<GByte *>(ppDestBuffer[2]);
    5618       53069 :                 GByte *CPL_RESTRICT pabyDest3 =
    5619             :                     static_cast<GByte *>(ppDestBuffer[3]);
    5620       53069 :                 GDALDeinterleave4Byte(pabySrc, pabyDest0, pabyDest1, pabyDest2,
    5621             :                                       pabyDest3, nIters);
    5622       53069 :                 return;
    5623          26 :             }
    5624             :         }
    5625             : #if ((defined(__GNUC__) && !defined(__clang__)) ||                             \
    5626             :      defined(__INTEL_CLANG_COMPILER)) &&                                       \
    5627             :     defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
    5628        1466 :         else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
    5629         733 :                  CPLHaveRuntimeSSSE3())
    5630             :         {
    5631         733 :             if (nComponents == 3)
    5632             :             {
    5633         239 :                 const GUInt16 *CPL_RESTRICT panSrc =
    5634             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    5635         239 :                 GUInt16 *CPL_RESTRICT panDest0 =
    5636             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    5637         239 :                 GUInt16 *CPL_RESTRICT panDest1 =
    5638             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    5639         239 :                 GUInt16 *CPL_RESTRICT panDest2 =
    5640             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    5641         239 :                 GDALDeinterleave3UInt16_SSSE3(panSrc, panDest0, panDest1,
    5642             :                                               panDest2, nIters);
    5643         239 :                 return;
    5644             :             }
    5645             : #if !defined(__INTEL_CLANG_COMPILER)
    5646             :             // ICC autovectorizer doesn't do a good job, at least with icx
    5647             :             // 2022.1.0.20220316
    5648         494 :             else if (nComponents == 4)
    5649             :             {
    5650         494 :                 const GUInt16 *CPL_RESTRICT panSrc =
    5651             :                     static_cast<const GUInt16 *>(pSourceBuffer);
    5652         494 :                 GUInt16 *CPL_RESTRICT panDest0 =
    5653             :                     static_cast<GUInt16 *>(ppDestBuffer[0]);
    5654         494 :                 GUInt16 *CPL_RESTRICT panDest1 =
    5655             :                     static_cast<GUInt16 *>(ppDestBuffer[1]);
    5656         494 :                 GUInt16 *CPL_RESTRICT panDest2 =
    5657             :                     static_cast<GUInt16 *>(ppDestBuffer[2]);
    5658         494 :                 GUInt16 *CPL_RESTRICT panDest3 =
    5659             :                     static_cast<GUInt16 *>(ppDestBuffer[3]);
    5660         494 :                 GDALDeinterleave4UInt16_SSSE3(panSrc, panDest0, panDest1,
    5661             :                                               panDest2, panDest3, nIters);
    5662         494 :                 return;
    5663             :             }
    5664             : #endif
    5665             :         }
    5666             : #endif
    5667             :     }
    5668             : 
    5669          49 :     const int nSourceDTSize = GDALGetDataTypeSizeBytes(eSourceDT);
    5670          22 :     const int nDestDTSize = GDALGetDataTypeSizeBytes(eDestDT);
    5671          87 :     for (int iComp = 0; iComp < nComponents; iComp++)
    5672             :     {
    5673          65 :         GDALCopyWords64(static_cast<const GByte *>(pSourceBuffer) +
    5674          65 :                             iComp * nSourceDTSize,
    5675             :                         eSourceDT, nComponents * nSourceDTSize,
    5676          65 :                         ppDestBuffer[iComp], eDestDT, nDestDTSize, nIters);
    5677             :     }
    5678             : }
    5679             : 
    5680             : /************************************************************************/
    5681             : /*                    GDALTranspose2DSingleToSingle()                   */
    5682             : /************************************************************************/
    5683             : /**
    5684             :  * Transpose a 2D array of non-complex values, in an efficient (cache-oblivious) way.
    5685             :  *
    5686             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    5687             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    5688             :  * @param nSrcWidth Width of pSrc array.
    5689             :  * @param nSrcHeight Height of pSrc array.
    5690             :  */
    5691             : 
    5692             : template <class DST, class SRC>
    5693         124 : void GDALTranspose2DSingleToSingle(const SRC *CPL_RESTRICT pSrc,
    5694             :                                    DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    5695             :                                    size_t nSrcHeight)
    5696             : {
    5697         124 :     constexpr size_t blocksize = 32;
    5698         273 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    5699             :     {
    5700         149 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    5701         348 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    5702             :         {
    5703             :             // transpose the block beginning at [i,j]
    5704         199 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    5705        2446 :             for (size_t k = i; k < max_k; ++k)
    5706             :             {
    5707       40849 :                 for (size_t l = j; l < max_l; ++l)
    5708             :                 {
    5709       38602 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],
    5710       38602 :                                  pDst[k + l * nSrcHeight]);
    5711             :                 }
    5712             :             }
    5713             :         }
    5714             :     }
    5715         124 : }
    5716             : 
    5717             : /************************************************************************/
    5718             : /*                   GDALTranspose2DComplexToComplex()                  */
    5719             : /************************************************************************/
    5720             : /**
    5721             :  * Transpose a 2D array of complex values into an array of complex values,
    5722             :  * in an efficient (cache-oblivious) way.
    5723             :  *
    5724             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    5725             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    5726             :  * @param nSrcWidth Width of pSrc array.
    5727             :  * @param nSrcHeight Height of pSrc array.
    5728             :  */
    5729             : template <class DST, class SRC>
    5730          16 : void GDALTranspose2DComplexToComplex(const SRC *CPL_RESTRICT pSrc,
    5731             :                                      DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    5732             :                                      size_t nSrcHeight)
    5733             : {
    5734          16 :     constexpr size_t blocksize = 32;
    5735          32 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    5736             :     {
    5737          16 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    5738          32 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    5739             :         {
    5740             :             // transpose the block beginning at [i,j]
    5741          16 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    5742          48 :             for (size_t k = i; k < max_k; ++k)
    5743             :             {
    5744         128 :                 for (size_t l = j; l < max_l; ++l)
    5745             :                 {
    5746          96 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
    5747          96 :                                  pDst[2 * (k + l * nSrcHeight) + 0]);
    5748          96 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 1],
    5749          96 :                                  pDst[2 * (k + l * nSrcHeight) + 1]);
    5750             :                 }
    5751             :             }
    5752             :         }
    5753             :     }
    5754          16 : }
    5755             : 
    5756             : /************************************************************************/
    5757             : /*                   GDALTranspose2DComplexToSingle()                  */
    5758             : /************************************************************************/
    5759             : /**
    5760             :  * Transpose a 2D array of complex values into an array of non-complex values,
    5761             :  * in an efficient (cache-oblivious) way.
    5762             :  *
    5763             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    5764             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    5765             :  * @param nSrcWidth Width of pSrc array.
    5766             :  * @param nSrcHeight Height of pSrc array.
    5767             :  */
    5768             : template <class DST, class SRC>
    5769          40 : void GDALTranspose2DComplexToSingle(const SRC *CPL_RESTRICT pSrc,
    5770             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    5771             :                                     size_t nSrcHeight)
    5772             : {
    5773          40 :     constexpr size_t blocksize = 32;
    5774          80 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    5775             :     {
    5776          40 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    5777          80 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    5778             :         {
    5779             :             // transpose the block beginning at [i,j]
    5780          40 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    5781         120 :             for (size_t k = i; k < max_k; ++k)
    5782             :             {
    5783         320 :                 for (size_t l = j; l < max_l; ++l)
    5784             :                 {
    5785         240 :                     GDALCopyWord(pSrc[2 * (l + k * nSrcWidth) + 0],
    5786         240 :                                  pDst[k + l * nSrcHeight]);
    5787             :                 }
    5788             :             }
    5789             :         }
    5790             :     }
    5791          40 : }
    5792             : 
    5793             : /************************************************************************/
    5794             : /*                   GDALTranspose2DSingleToComplex()                  */
    5795             : /************************************************************************/
    5796             : /**
    5797             :  * Transpose a 2D array of non-complex values into an array of complex values,
    5798             :  * in an efficient (cache-oblivious) way.
    5799             :  *
    5800             :  * @param pSrc Source array of height = nSrcHeight and width = nSrcWidth.
    5801             :  * @param pDst Destination transposed array of height = nSrcWidth and width = nSrcHeight.
    5802             :  * @param nSrcWidth Width of pSrc array.
    5803             :  * @param nSrcHeight Height of pSrc array.
    5804             :  */
    5805             : template <class DST, class SRC>
    5806          40 : void GDALTranspose2DSingleToComplex(const SRC *CPL_RESTRICT pSrc,
    5807             :                                     DST *CPL_RESTRICT pDst, size_t nSrcWidth,
    5808             :                                     size_t nSrcHeight)
    5809             : {
    5810          40 :     constexpr size_t blocksize = 32;
    5811          80 :     for (size_t i = 0; i < nSrcHeight; i += blocksize)
    5812             :     {
    5813          40 :         const size_t max_k = std::min(i + blocksize, nSrcHeight);
    5814          80 :         for (size_t j = 0; j < nSrcWidth; j += blocksize)
    5815             :         {
    5816             :             // transpose the block beginning at [i,j]
    5817          40 :             const size_t max_l = std::min(j + blocksize, nSrcWidth);
    5818         120 :             for (size_t k = i; k < max_k; ++k)
    5819             :             {
    5820         320 :                 for (size_t l = j; l < max_l; ++l)
    5821             :                 {
    5822         240 :                     GDALCopyWord(pSrc[l + k * nSrcWidth],
    5823         240 :                                  pDst[2 * (k + l * nSrcHeight) + 0]);
    5824         240 :                     pDst[2 * (k + l * nSrcHeight) + 1] = 0;
    5825             :                 }
    5826             :             }
    5827             :         }
    5828             :     }
    5829          40 : }
    5830             : 
    5831             : /************************************************************************/
    5832             : /*                        GDALTranspose2D()                             */
    5833             : /************************************************************************/
    5834             : 
    5835             : template <class DST, bool DST_IS_COMPLEX>
    5836         220 : static void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, DST *pDst,
    5837             :                             size_t nSrcWidth, size_t nSrcHeight)
    5838             : {
    5839             : #define CALL_GDALTranspose2D_internal(SRC_TYPE)                                \
    5840             :     do                                                                         \
    5841             :     {                                                                          \
    5842             :         if constexpr (DST_IS_COMPLEX)                                          \
    5843             :         {                                                                      \
    5844             :             GDALTranspose2DSingleToComplex(                                    \
    5845             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    5846             :                 nSrcHeight);                                                   \
    5847             :         }                                                                      \
    5848             :         else                                                                   \
    5849             :         {                                                                      \
    5850             :             GDALTranspose2DSingleToSingle(static_cast<const SRC_TYPE *>(pSrc), \
    5851             :                                           pDst, nSrcWidth, nSrcHeight);        \
    5852             :         }                                                                      \
    5853             :     } while (0)
    5854             : 
    5855             : #define CALL_GDALTranspose2DComplex_internal(SRC_TYPE)                         \
    5856             :     do                                                                         \
    5857             :     {                                                                          \
    5858             :         if constexpr (DST_IS_COMPLEX)                                          \
    5859             :         {                                                                      \
    5860             :             GDALTranspose2DComplexToComplex(                                   \
    5861             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    5862             :                 nSrcHeight);                                                   \
    5863             :         }                                                                      \
    5864             :         else                                                                   \
    5865             :         {                                                                      \
    5866             :             GDALTranspose2DComplexToSingle(                                    \
    5867             :                 static_cast<const SRC_TYPE *>(pSrc), pDst, nSrcWidth,          \
    5868             :                 nSrcHeight);                                                   \
    5869             :         }                                                                      \
    5870             :     } while (0)
    5871             : 
    5872             :     // clang-format off
    5873         220 :     switch (eSrcType)
    5874             :     {
    5875          14 :         case GDT_Byte:     CALL_GDALTranspose2D_internal(uint8_t); break;
    5876          13 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t); break;
    5877          22 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t); break;
    5878          14 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t); break;
    5879          22 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t); break;
    5880          14 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t); break;
    5881          14 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t); break;
    5882          14 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t); break;
    5883          15 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float); break;
    5884          22 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double); break;
    5885          14 :         case GDT_CInt16:   CALL_GDALTranspose2DComplex_internal(int16_t); break;
    5886          14 :         case GDT_CInt32:   CALL_GDALTranspose2DComplex_internal(int32_t); break;
    5887          14 :         case GDT_CFloat32: CALL_GDALTranspose2DComplex_internal(float); break;
    5888          14 :         case GDT_CFloat64: CALL_GDALTranspose2DComplex_internal(double); break;
    5889           0 :         case GDT_Unknown:
    5890             :         case GDT_TypeCount:
    5891           0 :             break;
    5892             :     }
    5893             :         // clang-format on
    5894             : 
    5895             : #undef CALL_GDALTranspose2D_internal
    5896             : #undef CALL_GDALTranspose2DComplex_internal
    5897         220 : }
    5898             : 
    5899             : /************************************************************************/
    5900             : /*                      GDALInterleave2Byte()                           */
    5901             : /************************************************************************/
    5902             : 
    5903             : #if defined(HAVE_SSE2) &&                                                      \
    5904             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    5905             : 
    5906             : // ICC autovectorizer doesn't do a good job at generating good SSE code,
    5907             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
    5908             : #if defined(__GNUC__)
    5909             : __attribute__((noinline))
    5910             : #endif
    5911             : static void
    5912             : GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
    5913             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters)
    5914             : {
    5915             :     size_t i = 0;
    5916             :     constexpr size_t VALS_PER_ITER = 16;
    5917             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
    5918             :     {
    5919             :         __m128i xmm0 =
    5920             :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + i));
    5921             :         __m128i xmm1 = _mm_loadu_si128(
    5922             :             reinterpret_cast<__m128i const *>(pSrc + i + nIters));
    5923             :         _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + 2 * i),
    5924             :                          _mm_unpacklo_epi8(xmm0, xmm1));
    5925             :         _mm_storeu_si128(
    5926             :             reinterpret_cast<__m128i *>(pDst + 2 * i + VALS_PER_ITER),
    5927             :             _mm_unpackhi_epi8(xmm0, xmm1));
    5928             :     }
    5929             : #if defined(__clang__)
    5930             : #pragma clang loop vectorize(disable)
    5931             : #endif
    5932             :     for (; i < nIters; ++i)
    5933             :     {
    5934             :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];
    5935             :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];
    5936             :     }
    5937             : }
    5938             : 
    5939             : #else
    5940             : 
    5941             : #if defined(__GNUC__) && !defined(__clang__)
    5942             : __attribute__((optimize("tree-vectorize")))  // GCC only: turn on tree vectorization for this function
    5943             : #endif
    5944             : #if defined(__GNUC__)
    5945             : __attribute__((noinline))  // keep this as an out-of-line function
    5946             : #endif
    5947             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    5948             : // clang++ -O2 -fsanitize=undefined fails to vectorize the loop below, so silence -Wpass-failed
    5949             : #pragma clang diagnostic push
    5950             : #pragma clang diagnostic ignored "-Wpass-failed"
    5951             : #endif
    5952             : static void  // Interleave the two consecutive nIters-byte runs of pSrc into pDst (2 * nIters bytes written).
    5953           4 : GDALInterleave2Byte(const uint8_t *CPL_RESTRICT pSrc,
    5954             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters)
    5955             : {
    5956             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    5957             : #pragma clang loop vectorize(enable)
    5958             : #endif
    5959          44 :     for (size_t i = 0; i < nIters; ++i)
    5960             :     {
    5961          40 :         pDst[2 * i + 0] = pSrc[i + 0 * nIters];  // byte i of the first run
    5962          40 :         pDst[2 * i + 1] = pSrc[i + 1 * nIters];  // byte i of the second run
    5963             :     }
    5964           4 : }
    5965             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    5966             : #pragma clang diagnostic pop
    5967             : #endif
    5968             : 
    5969             : #endif
    5970             : 
    5971             : /************************************************************************/
    5972             : /*                      GDALInterleave4Byte()                           */
    5973             : /************************************************************************/
    5974             : 
    5975             : #if defined(HAVE_SSE2) &&                                                      \
    5976             :     (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
    5977             : 
    5978             : // The ICC autovectorizer doesn't do a good job at generating good SSE code,
    5979             : // at least with icx 2024.0.2.20231213, but it nicely unrolls the loop below.
    5980             : #if defined(__GNUC__)
    5981             : __attribute__((noinline))  // keep this as an out-of-line function
    5982             : #endif
    5983             : static void  // Interleave the four consecutive nIters-byte runs of pSrc into pDst (4 * nIters bytes written).
    5984             : GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
    5985             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters)
    5986             : {
    5987             :     size_t i = 0;
    5988             :     constexpr size_t VALS_PER_ITER = 16;  // number of bytes in one __m128i
    5989             :     for (i = 0; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)  // SIMD loop: 16 bytes of each run per pass
    5990             :     {
    5991             :         __m128i xmm0 = _mm_loadu_si128(
    5992             :             reinterpret_cast<__m128i const *>(pSrc + i + 0 * nIters));
    5993             :         __m128i xmm1 = _mm_loadu_si128(
    5994             :             reinterpret_cast<__m128i const *>(pSrc + i + 1 * nIters));
    5995             :         __m128i xmm2 = _mm_loadu_si128(
    5996             :             reinterpret_cast<__m128i const *>(pSrc + i + 2 * nIters));
    5997             :         __m128i xmm3 = _mm_loadu_si128(
    5998             :             reinterpret_cast<__m128i const *>(pSrc + i + 3 * nIters));
    5999             :         auto tmp0 = _mm_unpacklo_epi8(
    6000             :             xmm0,
    6001             :             xmm1);  // (xmm0_0, xmm1_0, xmm0_1, xmm1_1, xmm0_2, xmm1_2, ...)
    6002             :         auto tmp1 = _mm_unpackhi_epi8(
    6003             :             xmm0,
    6004             :             xmm1);  // (xmm0_8, xmm1_8, xmm0_9, xmm1_9, xmm0_10, xmm1_10, ...)
    6005             :         auto tmp2 = _mm_unpacklo_epi8(
    6006             :             xmm2,
    6007             :             xmm3);  // (xmm2_0, xmm3_0, xmm2_1, xmm3_1, xmm2_2, xmm3_2, ...)
    6008             :         auto tmp3 = _mm_unpackhi_epi8(
    6009             :             xmm2,
    6010             :             xmm3);  // (xmm2_8, xmm3_8, xmm2_9, xmm3_9, xmm2_10, xmm3_10, ...)
    6011             :         auto tmp2_0 = _mm_unpacklo_epi16(
    6012             :             tmp0,
    6013             :             tmp2);  // (xmm0_0, xmm1_0, xmm2_0, xmm3_0, xmm0_1, xmm1_1, xmm2_1, xmm3_1, ...)
    6014             :         auto tmp2_1 = _mm_unpackhi_epi16(tmp0, tmp2);  // same layout, indices 4 to 7
    6015             :         auto tmp2_2 = _mm_unpacklo_epi16(tmp1, tmp3);  // same layout, indices 8 to 11
    6016             :         auto tmp2_3 = _mm_unpackhi_epi16(tmp1, tmp3);  // same layout, indices 12 to 15
    6017             :         _mm_storeu_si128(
    6018             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 0 * VALS_PER_ITER),
    6019             :             tmp2_0);
    6020             :         _mm_storeu_si128(
    6021             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 1 * VALS_PER_ITER),
    6022             :             tmp2_1);
    6023             :         _mm_storeu_si128(
    6024             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 2 * VALS_PER_ITER),
    6025             :             tmp2_2);
    6026             :         _mm_storeu_si128(
    6027             :             reinterpret_cast<__m128i *>(pDst + 4 * i + 3 * VALS_PER_ITER),
    6028             :             tmp2_3);
    6029             :     }
    6030             : #if defined(__clang__)  // the tail below runs fewer than 16 times: ask clang not to vectorize it
    6031             : #pragma clang loop vectorize(disable)
    6032             : #endif
    6033             :     for (; i < nIters; ++i)  // scalar tail for the bytes the SIMD loop did not cover
    6034             :     {
    6035             :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];
    6036             :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];
    6037             :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];
    6038             :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];
    6039             :     }
    6040             : }
    6041             : 
    6042             : #else
    6043             : 
    6044             : #if defined(__GNUC__) && !defined(__clang__)
    6045             : __attribute__((optimize("tree-vectorize")))  // GCC only: turn on tree vectorization for this function
    6046             : #endif
    6047             : #if defined(__GNUC__)
    6048             : __attribute__((noinline))  // keep this as an out-of-line function
    6049             : #endif
    6050             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6051             : // clang++ -O2 -fsanitize=undefined fails to vectorize the loop below, so silence -Wpass-failed
    6052             : #pragma clang diagnostic push
    6053             : #pragma clang diagnostic ignored "-Wpass-failed"
    6054             : #endif
    6055             : static void  // Interleave the four consecutive nIters-byte runs of pSrc into pDst (4 * nIters bytes written).
    6056           2 : GDALInterleave4Byte(const uint8_t *CPL_RESTRICT pSrc,
    6057             :                     uint8_t *CPL_RESTRICT pDst, size_t nIters)
    6058             : {
    6059             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6060             : #pragma clang loop vectorize(enable)
    6061             : #endif
    6062          36 :     for (size_t i = 0; i < nIters; ++i)
    6063             :     {
    6064          34 :         pDst[4 * i + 0] = pSrc[i + 0 * nIters];  // byte i of the first run
    6065          34 :         pDst[4 * i + 1] = pSrc[i + 1 * nIters];  // byte i of the second run
    6066          34 :         pDst[4 * i + 2] = pSrc[i + 2 * nIters];  // byte i of the third run
    6067          34 :         pDst[4 * i + 3] = pSrc[i + 3 * nIters];  // byte i of the fourth run
    6068             :     }
    6069           2 : }
    6070             : #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
    6071             : #pragma clang diagnostic pop
    6072             : #endif
    6073             : 
    6074             : #endif
    6075             : 
    6076             : /************************************************************************/
    6077             : /*                        GDALTranspose2D()                             */
    6078             : /************************************************************************/
    6079             : 
    6080             : /**
    6081             :  * Transpose a 2D array in an efficient (cache-oblivious) way.
    6082             :  *
    6083             :  * @param pSrc Source array of width = nSrcWidth and height = nSrcHeight.
    6084             :  * @param eSrcType Data type of pSrc.
    6085             :  * @param pDst Destination transposed array of width = nSrcHeight and height = nSrcWidth.
    6086             :  * @param eDstType Data type of pDst. Nothing is written for GDT_Unknown or GDT_TypeCount.
    6087             :  * @param nSrcWidth Width of pSrc array.
    6088             :  * @param nSrcHeight Height of pSrc array.
    6089             :  * @since GDAL 3.11
    6090             :  */
    6091             : 
    6092         245 : void GDALTranspose2D(const void *pSrc, GDALDataType eSrcType, void *pDst,
    6093             :                      GDALDataType eDstType, size_t nSrcWidth, size_t nSrcHeight)
    6094             : {
    6095         245 :     if (eSrcType == eDstType && (eSrcType == GDT_Byte || eSrcType == GDT_Int8))
    6096             :     {  // fast paths for 8-bit data whose source and destination types are identical
    6097          25 :         if (nSrcHeight == 2)  // two rows: the transpose is an interleave of both rows
    6098             :         {
    6099           4 :             GDALInterleave2Byte(static_cast<const uint8_t *>(pSrc),
    6100             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    6101           4 :             return;
    6102             :         }
    6103          21 :         if (nSrcHeight == 4)  // four rows: the transpose is an interleave of the four rows
    6104             :         {
    6105           2 :             GDALInterleave4Byte(static_cast<const uint8_t *>(pSrc),
    6106             :                                 static_cast<uint8_t *>(pDst), nSrcWidth);
    6107           2 :             return;
    6108             :         }
    6109             : #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                    \
    6110             :      (defined(__x86_64) || defined(_M_X64)))
    6111          19 :         if (CPLHaveRuntimeSSSE3())  // NOTE(review): presumably a runtime CPU capability test - confirm
    6112             :         {
    6113          19 :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    6114             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    6115             :                                        nSrcHeight);
    6116          19 :             return;
    6117             :         }
    6118             : #elif defined(USE_NEON_OPTIMIZATIONS)  // this branch calls the same *_SSSE3 routine with no runtime check
    6119             :         {
    6120             :             GDALTranspose2D_Byte_SSSE3(static_cast<const uint8_t *>(pSrc),
    6121             :                                        static_cast<uint8_t *>(pDst), nSrcWidth,
    6122             :                                        nSrcHeight);
    6123             :             return;
    6124             :         }
    6125             : #endif
    6126             :     }
    6127             : 
    6128             : #define CALL_GDALTranspose2D_internal(DST_TYPE, DST_IS_COMPLEX)                \
    6129             :     GDALTranspose2D<DST_TYPE, DST_IS_COMPLEX>(                                 \
    6130             :         pSrc, eSrcType, static_cast<DST_TYPE *>(pDst), nSrcWidth, nSrcHeight)
    6131             : 
    6132             :     // clang-format off
    6133         220 :     switch (eDstType)  // generic path: pick the template by destination type, eSrcType is forwarded as an argument
    6134             :     {
    6135          13 :         case GDT_Byte:     CALL_GDALTranspose2D_internal(uint8_t, false); break;
    6136          13 :         case GDT_Int8:     CALL_GDALTranspose2D_internal(int8_t, false); break;
    6137          22 :         case GDT_UInt16:   CALL_GDALTranspose2D_internal(uint16_t, false); break;
    6138          14 :         case GDT_Int16:    CALL_GDALTranspose2D_internal(int16_t, false); break;
    6139          22 :         case GDT_UInt32:   CALL_GDALTranspose2D_internal(uint32_t, false); break;
    6140          14 :         case GDT_Int32:    CALL_GDALTranspose2D_internal(int32_t, false); break;
    6141          14 :         case GDT_UInt64:   CALL_GDALTranspose2D_internal(uint64_t, false); break;
    6142          14 :         case GDT_Int64:    CALL_GDALTranspose2D_internal(int64_t, false); break;
    6143          15 :         case GDT_Float32:  CALL_GDALTranspose2D_internal(float, false); break;
    6144          23 :         case GDT_Float64:  CALL_GDALTranspose2D_internal(double, false); break;
    6145          14 :         case GDT_CInt16:   CALL_GDALTranspose2D_internal(int16_t, true); break;
    6146          14 :         case GDT_CInt32:   CALL_GDALTranspose2D_internal(int32_t, true); break;
    6147          14 :         case GDT_CFloat32: CALL_GDALTranspose2D_internal(float, true); break;
    6148          14 :         case GDT_CFloat64: CALL_GDALTranspose2D_internal(double, true); break;
    6149           0 :         case GDT_Unknown:  // listed explicitly, but nothing is done for these two values
    6150             :         case GDT_TypeCount:
    6151           0 :             break;
    6152             :     }
    6153             :         // clang-format on
    6154             : 
    6155             : #undef CALL_GDALTranspose2D_internal
    6156             : }
    6157             : 
    6158             : /************************************************************************/
    6159             : /*                     ExtractBitAndConvertTo255()                      */
    6160             : /************************************************************************/
    6161             : 
    6162             : #if defined(__GNUC__) || defined(_MSC_VER)
    6163             : // Signedness of char is implementation-dependent, so be explicit.
    6164             : // Assumes two's-complement integer types and sign extension on right shift.
    6165             : // GCC guarantees both:
    6166             : // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
    6167       95050 : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
    6168             : {
    6169       95050 :     return static_cast<GByte>(static_cast<signed char>(byVal << (7 - nBit)) >>
    6170       95050 :                               7);  // bit nBit becomes the sign bit, then the shift copies it into all 8 bits
    6171             : }
    6172             : #else
    6173             : // Portable way: test bit nBit and return 255 if it is set, 0 otherwise
    6174             : static inline GByte ExtractBitAndConvertTo255(GByte byVal, int nBit)
    6175             : {
    6176             :     return (byVal & (1 << nBit)) ? 255 : 0;
    6177             : }
    6178             : #endif
    6179             : 
    6180             : /************************************************************************/
    6181             : /*                   ExpandEightPackedBitsToByteAt255()                 */
    6182             : /************************************************************************/
    6183             : 
    6184       11697 : static inline void ExpandEightPackedBitsToByteAt255(GByte byVal,  // one input byte -> 8 output bytes
    6185             :                                                     GByte abyOutput[8])
    6186             : {
    6187       11697 :     abyOutput[0] = ExtractBitAndConvertTo255(byVal, 7);  // most-significant bit goes first
    6188       11697 :     abyOutput[1] = ExtractBitAndConvertTo255(byVal, 6);
    6189       11697 :     abyOutput[2] = ExtractBitAndConvertTo255(byVal, 5);
    6190       11697 :     abyOutput[3] = ExtractBitAndConvertTo255(byVal, 4);
    6191       11697 :     abyOutput[4] = ExtractBitAndConvertTo255(byVal, 3);
    6192       11697 :     abyOutput[5] = ExtractBitAndConvertTo255(byVal, 2);
    6193       11697 :     abyOutput[6] = ExtractBitAndConvertTo255(byVal, 1);
    6194       11697 :     abyOutput[7] = ExtractBitAndConvertTo255(byVal, 0);  // least-significant bit goes last
    6195       11697 : }
    6196             : 
    6197             : /************************************************************************/
    6198             : /*                GDALExpandPackedBitsToByteAt0Or255()                  */
    6199             : /************************************************************************/
    6200             : 
    6201             : /** Expand packed bits (ordered from most-significant to least-significant bit)
    6202             :   into one byte each, where a bit at 0 is expanded to a byte at 0, and a bit
    6203             :   at 1 to a byte at 255.
    6204             : 
    6205             :  The function does (in a possibly more optimized way) the following:
    6206             :  \code{.cpp}
    6207             :  for (size_t i = 0; i < nInputBits; ++i )
    6208             :  {
    6209             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 255 : 0;
    6210             :  }
    6211             :  \endcode
    6212             : 
    6213             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    6214             :  @param pabyOutput Output array of nInputBits bytes.
    6215             :  @param nInputBits Number of valid bits in pabyInput.
    6216             : 
    6217             :  @since 3.11
    6218             : */
    6219             : 
    6220       30937 : void GDALExpandPackedBitsToByteAt0Or255(const GByte *CPL_RESTRICT pabyInput,
    6221             :                                         GByte *CPL_RESTRICT pabyOutput,
    6222             :                                         size_t nInputBits)
    6223             : {
    6224       30937 :     const size_t nInputWholeBytes = nInputBits / 8;  // input bytes whose 8 bits are all valid
    6225       30937 :     size_t iByte = 0;
    6226             : 
    6227             : #ifdef HAVE_SSE2
    6228             :     // Mask to isolate one bit per output byte (0x80 first, 0x01 last), laid out twice
    6229       30937 :     const __m128i bit_mask = _mm_set_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4,
    6230             :                                           8, 16, 32, 64, -128);
    6231       30937 :     const __m128i zero = _mm_setzero_si128();
    6232       30937 :     const __m128i all_ones = _mm_set1_epi8(-1);
    6233             : #ifdef __SSSE3__
    6234             :     const __m128i dispatch_two_bytes =
    6235             :         _mm_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
    6236             : #endif
    6237       30937 :     constexpr size_t SSE_REG_SIZE = sizeof(bit_mask);
    6238       79750 :     for (; iByte + SSE_REG_SIZE <= nInputWholeBytes; iByte += SSE_REG_SIZE)  // one register of input bytes per pass
    6239             :     {
    6240       48813 :         __m128i reg_ori = _mm_loadu_si128(
    6241       48813 :             reinterpret_cast<const __m128i *>(pabyInput + iByte));
    6242             : 
    6243       48813 :         constexpr int NUM_PROCESSED_BYTES_PER_REG = 2;
    6244      439317 :         for (size_t k = 0; k < SSE_REG_SIZE / NUM_PROCESSED_BYTES_PER_REG; ++k)  // two input bytes per round
    6245             :         {
    6246             :             // Given reg_ori = (A, B, ... 14 other bytes ...),
    6247             :             // expand to (A, A, A, A, A, A, A, A, B, B, B, B, B, B, B, B)
    6248             : #ifdef __SSSE3__
    6249             :             __m128i reg = _mm_shuffle_epi8(reg_ori, dispatch_two_bytes);
    6250             : #else
    6251      390504 :             __m128i reg = _mm_unpacklo_epi8(reg_ori, reg_ori);
    6252      390504 :             reg = _mm_unpacklo_epi16(reg, reg);
    6253      390504 :             reg = _mm_unpacklo_epi32(reg, reg);
    6254             : #endif
    6255             : 
    6256             :             // Keep only the bit of interest in each byte
    6257      390504 :             reg = _mm_and_si128(reg, bit_mask);
    6258             : 
    6259             :             // Now test if those bits are set, by comparing to zero. So the
    6260             :             // result will be that bytes where bits are set will be at 0, and
    6261             :             // ones where they are cleared will be at 0xFF. So the inverse of
    6262             :             // the end result we want!
    6263      390504 :             reg = _mm_cmpeq_epi8(reg, zero);
    6264             : 
    6265             :             // Invert the result
    6266      390504 :             reg = _mm_andnot_si128(reg, all_ones);
    6267             : 
    6268             :             _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyOutput), reg);  // one full register of output bytes
    6269             : 
    6270      390504 :             pabyOutput += SSE_REG_SIZE;
    6271             : 
    6272             :             // Byte-wise right shift by 2, so the next two input bytes come first
    6273      390504 :             reg_ori = _mm_bsrli_si128(reg_ori, NUM_PROCESSED_BYTES_PER_REG);
    6274             :         }
    6275             :     }
    6276             : 
    6277             : #endif  // HAVE_SSE2
    6278             : 
    6279       42634 :     for (; iByte < nInputWholeBytes; ++iByte)  // scalar path for the remaining whole bytes
    6280             :     {
    6281       11697 :         ExpandEightPackedBitsToByteAt255(pabyInput[iByte], pabyOutput);
    6282       11697 :         pabyOutput += 8;
    6283             :     }
    6284       32411 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)  // leftover bits of the byte after the last whole one
    6285             :     {
    6286        1474 :         *pabyOutput = ExtractBitAndConvertTo255(pabyInput[iByte], 7 - iBit);
    6287        1474 :         ++pabyOutput;
    6288             :     }
    6289       30937 : }
    6290             : 
    6291             : /************************************************************************/
    6292             : /*                   ExpandEightPackedBitsToByteAt1()                   */
    6293             : /************************************************************************/
    6294             : 
    6295      136113 : static inline void ExpandEightPackedBitsToByteAt1(GByte byVal,  // one input byte -> 8 output bytes of 0 or 1
    6296             :                                                   GByte abyOutput[8])
    6297             : {
    6298      136113 :     abyOutput[0] = (byVal >> 7) & 0x1;  // most-significant bit goes first
    6299      136113 :     abyOutput[1] = (byVal >> 6) & 0x1;
    6300      136113 :     abyOutput[2] = (byVal >> 5) & 0x1;
    6301      136113 :     abyOutput[3] = (byVal >> 4) & 0x1;
    6302      136113 :     abyOutput[4] = (byVal >> 3) & 0x1;
    6303      136113 :     abyOutput[5] = (byVal >> 2) & 0x1;
    6304      136113 :     abyOutput[6] = (byVal >> 1) & 0x1;
    6305      136113 :     abyOutput[7] = (byVal >> 0) & 0x1;  // least-significant bit goes last
    6306      136113 : }
    6307             : 
    6308             : /************************************************************************/
    6309             : /*                GDALExpandPackedBitsToByteAt0Or1()                    */
    6310             : /************************************************************************/
    6311             : 
    6312             : /** Expand packed bits (ordered from most-significant to least-significant bit)
    6313             :   into one byte each, where a bit at 0 is expanded to a byte at 0, and a bit
    6314             :   at 1 to a byte at 1.
    6315             : 
    6316             :  The function does (in a possibly more optimized way) the following:
    6317             :  \code{.cpp}
    6318             :  for (size_t i = 0; i < nInputBits; ++i )
    6319             :  {
    6320             :      pabyOutput[i] = (pabyInput[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
    6321             :  }
    6322             :  \endcode
    6323             : 
    6324             :  @param pabyInput Input array of (nInputBits + 7) / 8 bytes.
    6325             :  @param pabyOutput Output array of nInputBits bytes.
    6326             :  @param nInputBits Number of valid bits in pabyInput.
    6327             : 
    6328             :  @since 3.11
    6329             : */
    6330             : 
    6331        7041 : void GDALExpandPackedBitsToByteAt0Or1(const GByte *CPL_RESTRICT pabyInput,
    6332             :                                       GByte *CPL_RESTRICT pabyOutput,
    6333             :                                       size_t nInputBits)
    6334             : {
    6335        7041 :     const size_t nInputWholeBytes = nInputBits / 8;  // input bytes whose 8 bits are all valid
    6336        7041 :     size_t iByte = 0;
    6337      143154 :     for (; iByte < nInputWholeBytes; ++iByte)  // whole bytes: 8 output bytes each
    6338             :     {
    6339      136113 :         ExpandEightPackedBitsToByteAt1(pabyInput[iByte], pabyOutput);
    6340      136113 :         pabyOutput += 8;
    6341             :     }
    6342       18902 :     for (int iBit = 0; iBit < static_cast<int>(nInputBits % 8); ++iBit)  // leftover bits of the byte after the last whole one
    6343             :     {
    6344       11861 :         *pabyOutput = (pabyInput[iByte] >> (7 - iBit)) & 0x1;
    6345       11861 :         ++pabyOutput;
    6346             :     }
    6347        7041 : }

Generated by: LCOV version 1.14