LCOV - code coverage report
Current view: top level - gcore - overview.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2767 3148 87.9 %
Date: 2026-06-23 23:27:26 Functions: 174 191 91.1 %

          Line data    Source code
       1             : 
       2             : /******************************************************************************
       3             :  *
       4             :  * Project:  GDAL Core
       5             :  * Purpose:  Helper code to implement overview support in different drivers.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 2000, Frank Warmerdam
      10             :  * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal_priv.h"
      17             : 
      18             : #include <cmath>
      19             : #include <cstddef>
      20             : #include <cstdlib>
      21             : 
      22             : #include <algorithm>
      23             : #include <complex>
      24             : #include <condition_variable>
      25             : #include <limits>
      26             : #include <list>
      27             : #include <memory>
      28             : #include <mutex>
      29             : #include <vector>
      30             : 
      31             : #include "cpl_conv.h"
      32             : #include "cpl_error.h"
      33             : #include "cpl_float.h"
      34             : #include "cpl_progress.h"
      35             : #include "cpl_vsi.h"
      36             : #include "cpl_worker_thread_pool.h"
      37             : #include "gdal.h"
      38             : #include "gdal_thread_pool.h"
      39             : #include "gdalwarper.h"
      40             : #include "gdal_vrt.h"
      41             : #include "vrtdataset.h"
      42             : 
      43             : #ifdef USE_NEON_OPTIMIZATIONS
      44             : #include "include_sse2neon.h"
      45             : 
      46             : #if (!defined(__aarch64__) && !defined(_M_ARM64))
      47             : #define ARM_V7
      48             : #endif
      49             : 
      50             : #define USE_SSE2
      51             : 
      52             : #include "gdalsse_priv.h"
      53             : 
      54             : // Restrict to 64bit processors because they are guaranteed to have SSE2,
      55             : // or if __AVX2__ is defined.
      56             : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
      57             : #define USE_SSE2
      58             : 
      59             : #include "gdalsse_priv.h"
      60             : 
      61             : #ifdef __SSE3__
      62             : #include <pmmintrin.h>
      63             : #endif
      64             : #ifdef __SSSE3__
      65             : #include <tmmintrin.h>
      66             : #endif
      67             : #ifdef __SSE4_1__
      68             : #include <smmintrin.h>
      69             : #endif
      70             : #ifdef __AVX2__
      71             : #include <immintrin.h>
      72             : #endif
      73             : 
      74             : #endif
      75             : 
      76             : // To be included after above USE_SSE2 and include gdalsse_priv.h
      77             : // to avoid build issue on Windows x86
      78             : #include "gdal_priv_templates.hpp"
      79             : 
      80             : /************************************************************************/
      81             : /*                       GDALResampleChunk_Near()                       */
      82             : /************************************************************************/
      83             : 
      84             : template <class T>
      85        1266 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
      86             :                                       const T *pChunk, T **ppDstBuffer)
      87             : 
      88             : {
      89        1266 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
      90        1266 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
      91        1266 :     const GDALDataType eWrkDataType = args.eWrkDataType;
      92        1266 :     const int nChunkXOff = args.nChunkXOff;
      93        1266 :     const int nChunkXSize = args.nChunkXSize;
      94        1266 :     const int nChunkYOff = args.nChunkYOff;
      95        1266 :     const int nDstXOff = args.nDstXOff;
      96        1266 :     const int nDstXOff2 = args.nDstXOff2;
      97        1266 :     const int nDstYOff = args.nDstYOff;
      98        1266 :     const int nDstYOff2 = args.nDstYOff2;
      99        1266 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
     100             : 
     101             :     /* -------------------------------------------------------------------- */
     102             :     /*      Allocate buffers.                                               */
     103             :     /* -------------------------------------------------------------------- */
     104        1266 :     *ppDstBuffer = static_cast<T *>(
     105        1266 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
     106             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
     107        1266 :     if (*ppDstBuffer == nullptr)
     108             :     {
     109           0 :         return CE_Failure;
     110             :     }
     111        1266 :     T *const pDstBuffer = *ppDstBuffer;
     112             : 
     113             :     int *panSrcXOff =
     114        1266 :         static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
     115             : 
     116        1266 :     if (panSrcXOff == nullptr)
     117             :     {
     118           0 :         return CE_Failure;
     119             :     }
     120             : 
     121             :     /* ==================================================================== */
     122             :     /*      Precompute inner loop constants.                                */
     123             :     /* ==================================================================== */
     124      840896 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
     125             :     {
     126      839630 :         int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
     127      839630 :         if (nSrcXOff < nChunkXOff)
     128           0 :             nSrcXOff = nChunkXOff;
     129             : 
     130      839630 :         panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
     131             :     }
     132             : 
     133             :     /* ==================================================================== */
     134             :     /*      Loop over destination scanlines.                                */
     135             :     /* ==================================================================== */
     136      142457 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
     137             :     {
     138      141191 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
     139      141191 :         if (nSrcYOff < nChunkYOff)
     140           0 :             nSrcYOff = nChunkYOff;
     141             : 
     142      141191 :         const T *const pSrcScanline =
     143             :             pChunk +
     144      141191 :             (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
     145      137798 :             nChunkXOff;
     146             : 
     147             :         /* --------------------------------------------------------------------
     148             :          */
     149             :         /*      Loop over destination pixels */
     150             :         /* --------------------------------------------------------------------
     151             :          */
     152      141191 :         T *pDstScanline =
     153      141191 :             pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
     154   120247393 :         for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
     155             :         {
     156   120106000 :             pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
     157             :         }
     158             :     }
     159             : 
     160        1266 :     CPLFree(panSrcXOff);
     161             : 
     162        1266 :     return CE_None;
     163             : }
     164             : 
     165        1266 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
     166             :                                      const void *pChunk, void **ppDstBuffer,
     167             :                                      GDALDataType *peDstBufferDataType)
     168             : {
     169        1266 :     *peDstBufferDataType = args.eWrkDataType;
     170        1266 :     switch (args.eWrkDataType)
     171             :     {
     172             :         // For nearest resampling, as no computation is done, only the
     173             :         // size of the data type matters.
     174        1098 :         case GDT_UInt8:
     175             :         case GDT_Int8:
     176             :         {
     177        1098 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
     178        1098 :             return GDALResampleChunk_NearT(
     179             :                 args, static_cast<const uint8_t *>(pChunk),
     180        1098 :                 reinterpret_cast<uint8_t **>(ppDstBuffer));
     181             :         }
     182             : 
     183          52 :         case GDT_Int16:
     184             :         case GDT_UInt16:
     185             :         case GDT_Float16:
     186             :         {
     187          52 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
     188          52 :             return GDALResampleChunk_NearT(
     189             :                 args, static_cast<const uint16_t *>(pChunk),
     190          52 :                 reinterpret_cast<uint16_t **>(ppDstBuffer));
     191             :         }
     192             : 
     193          68 :         case GDT_CInt16:
     194             :         case GDT_CFloat16:
     195             :         case GDT_Int32:
     196             :         case GDT_UInt32:
     197             :         case GDT_Float32:
     198             :         {
     199          68 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
     200          68 :             return GDALResampleChunk_NearT(
     201             :                 args, static_cast<const uint32_t *>(pChunk),
     202          68 :                 reinterpret_cast<uint32_t **>(ppDstBuffer));
     203             :         }
     204             : 
     205          44 :         case GDT_CInt32:
     206             :         case GDT_CFloat32:
     207             :         case GDT_Int64:
     208             :         case GDT_UInt64:
     209             :         case GDT_Float64:
     210             :         {
     211          44 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
     212          44 :             return GDALResampleChunk_NearT(
     213             :                 args, static_cast<const uint64_t *>(pChunk),
     214          44 :                 reinterpret_cast<uint64_t **>(ppDstBuffer));
     215             :         }
     216             : 
     217           4 :         case GDT_CFloat64:
     218             :         {
     219           4 :             return GDALResampleChunk_NearT(
     220             :                 args, static_cast<const std::complex<double> *>(pChunk),
     221           4 :                 reinterpret_cast<std::complex<double> **>(ppDstBuffer));
     222             :         }
     223             : 
     224           0 :         case GDT_Unknown:
     225             :         case GDT_TypeCount:
     226           0 :             break;
     227             :     }
     228           0 :     CPLAssert(false);
     229             :     return CE_Failure;
     230             : }
     231             : 
     232             : namespace
     233             : {
     234             : 
     235             : // Find in the color table the entry whose RGB value is the closest
     236             : // (using quadratic distance) to the test color, ignoring transparent entries.
     237        3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
     238             :                    const GDALColorEntry &test)
     239             : {
     240        3837 :     int nMinDist = std::numeric_limits<int>::max();
     241        3837 :     size_t bestEntry = 0;
     242      986109 :     for (size_t i = 0; i < entries.size(); ++i)
     243             :     {
     244      982272 :         const GDALColorEntry &entry = entries[i];
     245             :         // Ignore transparent entries
     246      982272 :         if (entry.c4 == 0)
     247        3237 :             continue;
     248             : 
     249      979035 :         int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
     250      979035 :                     ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
     251      979035 :                     ((test.c3 - entry.c3) * (test.c3 - entry.c3));
     252      979035 :         if (nDist < nMinDist)
     253             :         {
     254       15847 :             nMinDist = nDist;
     255       15847 :             bestEntry = i;
     256             :         }
     257             :     }
     258        3837 :     return static_cast<int>(bestEntry);
     259             : }
     260             : 
     261           7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
     262             :                                            int &transparentIdx)
     263             : {
     264           7 :     std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
     265             : 
     266           7 :     transparentIdx = -1;
     267           7 :     int i = 0;
     268        1799 :     for (auto &entry : entries)
     269             :     {
     270        1792 :         table.GetColorEntryAsRGB(i, &entry);
     271        1792 :         if (transparentIdx < 0 && entry.c4 == 0)
     272           1 :             transparentIdx = i;
     273        1792 :         ++i;
     274             :     }
     275           7 :     return entries;
     276             : }
     277             : 
     278             : }  // unnamed  namespace
     279             : 
     280             : /************************************************************************/
     281             : /*                               SQUARE()                               */
     282             : /************************************************************************/
     283             : 
     284        6427 : template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
     285             : {
     286        6427 :     return static_cast<Tsquare>(val) * val;
     287             : }
     288             : 
     289             : /************************************************************************/
     290             : /*                         ComputeIntegerRMS()                          */
     291             : /************************************************************************/
     292             : // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
     293             : // integer that minimizes abs(rms**2 - sumSquares / weight)
     294             : template <class T, class Twork>
     295          42 : inline T ComputeIntegerRMS(double sumSquares, double weight)
     296             : {
     297          42 :     const double sumDivWeight = sumSquares / weight;
     298          42 :     T rms = static_cast<T>(sqrt(sumDivWeight));
     299             : 
     300             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     301             :     // Naive version:
     302             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     303          42 :     if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
     304          42 :         2 * sumDivWeight)
     305           6 :         rms += 1;
     306          42 :     return rms;
     307             : }
     308             : 
     309             : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
     310             : {
     311             :     CPLAssert(false);
     312             :     return 0;
     313             : }
     314             : 
     315          28 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
     316             : {
     317             :     // It has been verified that given the correction on rms below, using
     318             :     // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
     319             :     // is equivalent, so use the former as it is used twice.
     320          28 :     const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
     321          28 :     const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
     322          28 :     GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
     323             : 
     324             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     325             :     // Naive version:
     326             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     327             :     // Optimized version for integer case and weight == 4
     328          28 :     if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
     329           5 :         rms += 1;
     330          28 :     return rms;
     331             : }
     332             : 
     333             : template <>
     334          24 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
     335             : {
     336          24 :     const double sumDivWeight = sumSquares * 0.25;
     337          24 :     GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
     338             : 
     339             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     340             :     // Naive version:
     341             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     342             :     // Optimized version for integer case and weight == 4
     343          24 :     if (static_cast<GUInt32>(rms) * (rms + 1) <
     344          24 :         static_cast<GUInt32>(sumDivWeight + 0.25))
     345           4 :         rms += 1;
     346          24 :     return rms;
     347             : }
     348             : 
     349             : #ifdef USE_SSE2
     350             : 
     351             : /************************************************************************/
     352             : /*                    QuadraticMeanByteSSE2OrAVX2()                     */
     353             : /************************************************************************/
     354             : 
     355             : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
     356             : #define sse2_hadd_epi16 _mm_hadd_epi16
     357             : #else
     358     4104270 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
     359             : {
     360             :     // Horizontal addition of adjacent pairs
     361     4104270 :     const auto mask = _mm_set1_epi32(0xFFFF);
     362             :     const auto horizLo =
     363    12312800 :         _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
     364             :     const auto horizHi =
     365    12312800 :         _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
     366             : 
     367             :     // Recombine low and high parts
     368     4104270 :     return _mm_packs_epi32(horizLo, horizHi);
     369             : }
     370             : #endif
     371             : 
     372             : #ifdef __AVX2__
     373             : 
     374             : #define set1_epi16 _mm256_set1_epi16
     375             : #define set1_epi32 _mm256_set1_epi32
     376             : #define setzero _mm256_setzero_si256
     377             : #define set1_ps _mm256_set1_ps
     378             : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
     379             : #define unpacklo_epi8 _mm256_unpacklo_epi8
     380             : #define unpackhi_epi8 _mm256_unpackhi_epi8
     381             : #define madd_epi16 _mm256_madd_epi16
     382             : #define add_epi32 _mm256_add_epi32
     383             : #define mul_ps _mm256_mul_ps
     384             : #define cvtepi32_ps _mm256_cvtepi32_ps
     385             : #define sqrt_ps _mm256_sqrt_ps
     386             : #define cvttps_epi32 _mm256_cvttps_epi32
     387             : #define packs_epi32 _mm256_packs_epi32
     388             : #define packus_epi32 _mm256_packus_epi32
     389             : #define srli_epi32 _mm256_srli_epi32
     390             : #define mullo_epi16 _mm256_mullo_epi16
     391             : #define srli_epi16 _mm256_srli_epi16
     392             : #define cmpgt_epi16 _mm256_cmpgt_epi16
     393             : #define add_epi16 _mm256_add_epi16
     394             : #define sub_epi16 _mm256_sub_epi16
     395             : #define packus_epi16 _mm256_packus_epi16
     396             : 
     397             : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
     398             : /* to get the lower 128-bit bits of what would be a true 256-bit vector register
     399             :  */
     400             : 
     401             : inline __m256i FIXUP_LANES(__m256i x)
     402             : {
     403             :     return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
     404             : }
     405             : 
     406             : #define store_lo(x, y)                                                         \
     407             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
     408             :                      _mm256_extracti128_si256(FIXUP_LANES(y), 0))
     409             : #define storeu_int(x, y)                                                       \
     410             :     _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))
     411             : #define hadd_epi16 _mm256_hadd_epi16
     412             : #else
     413             : #define set1_epi16 _mm_set1_epi16
     414             : #define set1_epi32 _mm_set1_epi32
     415             : #define setzero _mm_setzero_si128
     416             : #define set1_ps _mm_set1_ps
     417             : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
     418             : #define unpacklo_epi8 _mm_unpacklo_epi8
     419             : #define unpackhi_epi8 _mm_unpackhi_epi8
     420             : #define madd_epi16 _mm_madd_epi16
     421             : #define add_epi32 _mm_add_epi32
     422             : #define mul_ps _mm_mul_ps
     423             : #define cvtepi32_ps _mm_cvtepi32_ps
     424             : #define sqrt_ps _mm_sqrt_ps
     425             : #define cvttps_epi32 _mm_cvttps_epi32
     426             : #define packs_epi32 _mm_packs_epi32
     427             : #define packus_epi32 GDAL_mm_packus_epi32
     428             : #define srli_epi32 _mm_srli_epi32
     429             : #define mullo_epi16 _mm_mullo_epi16
     430             : #define srli_epi16 _mm_srli_epi16
     431             : #define cmpgt_epi16 _mm_cmpgt_epi16
     432             : #define add_epi16 _mm_add_epi16
     433             : #define sub_epi16 _mm_sub_epi16
     434             : #define packus_epi16 _mm_packus_epi16
     435             : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
     436             : #define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))
     437             : #define hadd_epi16 sse2_hadd_epi16
     438             : #endif
     439             : 
     440             : template <class T>
     441             : static int
     442             : #if defined(__GNUC__)
     443             :     __attribute__((noinline))
     444             : #endif
     445        5389 :     QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
     446             :                                 const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     447             :                                 T *CPL_RESTRICT pDstScanline)
     448             : {
     449             :     // Optimized implementation for RMS on Byte by
     450             :     // processing by group of 8 output pixels, so as to use
     451             :     // a single _mm_sqrt_ps() call for 4 output pixels
     452        5389 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     453             : 
     454        5389 :     int iDstPixel = 0;
     455        5389 :     const auto one16 = set1_epi16(1);
     456        5389 :     const auto one32 = set1_epi32(1);
     457        5389 :     const auto zero = setzero();
     458        5389 :     const auto minus32768 = set1_epi16(-32768);
     459             : 
     460        5389 :     constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
     461      521504 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     462             :     {
     463             :         // Load 2 * DEST_ELTS bytes from each line
     464      516115 :         auto firstLine = loadu_int(pSrcScanlineShifted);
     465     1032230 :         auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
     466             :         // Extend those Bytes as UInt16s
     467      516115 :         auto firstLineLo = unpacklo_epi8(firstLine, zero);
     468      516115 :         auto firstLineHi = unpackhi_epi8(firstLine, zero);
     469      516115 :         auto secondLineLo = unpacklo_epi8(secondLine, zero);
     470      516115 :         auto secondLineHi = unpackhi_epi8(secondLine, zero);
     471             : 
     472             :         // Multiplication of 16 bit values and horizontal
     473             :         // addition of 32 bit results
     474             :         // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
     475      516115 :         firstLineLo = madd_epi16(firstLineLo, firstLineLo);
     476      516115 :         firstLineHi = madd_epi16(firstLineHi, firstLineHi);
     477      516115 :         secondLineLo = madd_epi16(secondLineLo, secondLineLo);
     478      516115 :         secondLineHi = madd_epi16(secondLineHi, secondLineHi);
     479             : 
     480             :         // Vertical addition
     481      516115 :         const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
     482      516115 :         const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
     483             : 
     484             :         const auto sumSquaresPlusOneDiv4Lo =
     485     1032230 :             srli_epi32(add_epi32(sumSquaresLo, one32), 2);
     486             :         const auto sumSquaresPlusOneDiv4Hi =
     487     1032230 :             srli_epi32(add_epi32(sumSquaresHi, one32), 2);
     488             : 
     489             :         // Take square root and truncate/floor to int32
     490             :         const auto rmsLo =
     491     1548340 :             cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
     492             :         const auto rmsHi =
     493     1548340 :             cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
     494             : 
     495             :         // Merge back low and high registers with each RMS value
     496             :         // as a 16 bit value.
     497      516115 :         auto rms = packs_epi32(rmsLo, rmsHi);
     498             : 
     499             :         // Round to upper value if it minimizes the
     500             :         // error |rms^2 - sumSquares/4|
     501             :         // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     502             :         //    rms += 1;
     503             :         // which is equivalent to:
     504             :         // if( rms * (rms + 1) < (sumSquares+1) / 4 )
     505             :         //    rms += 1;
     506             :         // And both left and right parts fit on 16 (unsigned) bits
     507             :         const auto sumSquaresPlusOneDiv4 =
     508      516115 :             packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
     509             :         // cmpgt_epi16 operates on signed int16, but here
     510             :         // we have unsigned values, so shift them by -32768 before
     511     2580580 :         const auto mask = cmpgt_epi16(
     512             :             add_epi16(sumSquaresPlusOneDiv4, minus32768),
     513             :             add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
     514             :         // The value of the mask will be -1 when the correction needs to be
     515             :         // applied
     516      516115 :         rms = sub_epi16(rms, mask);
     517             : 
     518             :         // Pack each 16 bit RMS value to 8 bits
     519      516115 :         rms = packus_epi16(rms, rms /* could be anything */);
     520      516115 :         store_lo(&pDstScanline[iDstPixel], rms);
     521      516115 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     522             :     }
     523             : 
     524        5389 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     525        5389 :     return iDstPixel;
     526             : }
     527             : 
     528             : /************************************************************************/
     529             : /*                       AverageByteSSE2OrAVX2()                        */
     530             : /************************************************************************/
     531             : 
     532             : static int
     533      120136 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
     534             :                       const GByte *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     535             :                       GByte *CPL_RESTRICT pDstScanline)
     536             : {
     537             :     // Optimized implementation for average on Byte by
     538             :     // processing by group of 16 output pixels for SSE2, or 32 for AVX2
     539             : 
     540      120136 :     const auto zero = setzero();
     541      120136 :     const auto two16 = set1_epi16(2);
     542      120136 :     const GByte *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     543             : 
     544      120136 :     constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
     545      120136 :     int iDstPixel = 0;
     546     2172270 :     for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
     547     2052130 :          iDstPixel += 2 * DEST_ELTS)
     548             :     {
     549             :         decltype(setzero()) average0;
     550             :         {
     551             :             // Load 2 * DEST_ELTS bytes from each line
     552     2052130 :             const auto firstLine = loadu_int(pSrcScanlineShifted);
     553             :             const auto secondLine =
     554     4104270 :                 loadu_int(pSrcScanlineShifted + nChunkXSize);
     555             :             // Extend those Bytes as UInt16s
     556     2052130 :             const auto firstLineLo = unpacklo_epi8(firstLine, zero);
     557     2052130 :             const auto firstLineHi = unpackhi_epi8(firstLine, zero);
     558     2052130 :             const auto secondLineLo = unpacklo_epi8(secondLine, zero);
     559     2052130 :             const auto secondLineHi = unpackhi_epi8(secondLine, zero);
     560             : 
     561             :             // Vertical addition
     562     2052130 :             const auto sumLo = add_epi16(firstLineLo, secondLineLo);
     563     2052130 :             const auto sumHi = add_epi16(firstLineHi, secondLineHi);
     564             : 
     565             :             // Horizontal addition of adjacent pairs, and recombine low and high
     566             :             // parts
     567     2052130 :             const auto sum = hadd_epi16(sumLo, sumHi);
     568             : 
     569             :             // average = (sum + 2) / 4
     570     2052130 :             average0 = srli_epi16(add_epi16(sum, two16), 2);
     571             : 
     572     2052130 :             pSrcScanlineShifted += 2 * DEST_ELTS;
     573             :         }
     574             : 
     575             :         decltype(setzero()) average1;
     576             :         {
     577             :             // Load 2 * DEST_ELTS bytes from each line
     578     2052130 :             const auto firstLine = loadu_int(pSrcScanlineShifted);
     579             :             const auto secondLine =
     580     4104270 :                 loadu_int(pSrcScanlineShifted + nChunkXSize);
     581             :             // Extend those Bytes as UInt16s
     582     2052130 :             const auto firstLineLo = unpacklo_epi8(firstLine, zero);
     583     2052130 :             const auto firstLineHi = unpackhi_epi8(firstLine, zero);
     584     2052130 :             const auto secondLineLo = unpacklo_epi8(secondLine, zero);
     585     2052130 :             const auto secondLineHi = unpackhi_epi8(secondLine, zero);
     586             : 
     587             :             // Vertical addition
     588     2052130 :             const auto sumLo = add_epi16(firstLineLo, secondLineLo);
     589     2052130 :             const auto sumHi = add_epi16(firstLineHi, secondLineHi);
     590             : 
     591             :             // Horizontal addition of adjacent pairs, and recombine low and high
     592             :             // parts
     593     2052130 :             const auto sum = hadd_epi16(sumLo, sumHi);
     594             : 
     595             :             // average = (sum + 2) / 4
     596     2052130 :             average1 = srli_epi16(add_epi16(sum, two16), 2);
     597             : 
     598     2052130 :             pSrcScanlineShifted += 2 * DEST_ELTS;
     599             :         }
     600             : 
     601             :         // Pack each 16 bit average value to 8 bits
     602     2052130 :         const auto average = packus_epi16(average0, average1);
     603     2052130 :         storeu_int(&pDstScanline[iDstPixel], average);
     604             :     }
     605             : 
     606      120136 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     607      120136 :     return iDstPixel;
     608             : }
     609             : 
     610             : /************************************************************************/
     611             : /*                      QuadraticMeanUInt16SSE2()                       */
     612             : /************************************************************************/
     613             : 
     614             : #ifdef __SSE3__
     615             : #define sse2_hadd_pd _mm_hadd_pd
     616             : #else
     617         185 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
     618             : {
     619             :     auto aLo_bLo =
     620         740 :         _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
     621             :     auto aHi_bHi =
     622         740 :         _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
     623         185 :     return _mm_add_pd(aLo_bLo, aHi_bHi);  // (aLo + aHi, bLo + bHi)
     624             : }
     625             : #endif
     626             : 
     627         120 : inline __m128d SQUARE_PD(__m128d x)
     628             : {
     629         120 :     return _mm_mul_pd(x, x);
     630             : }
     631             : 
     632             : #ifdef __AVX2__
     633             : 
     634             : inline __m256d SQUARE_PD(__m256d x)
     635             : {
     636             :     return _mm256_mul_pd(x, x);
     637             : }
     638             : 
     639             : inline __m256d FIXUP_LANES(__m256d x)
     640             : {
     641             :     return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
     642             : }
     643             : 
     644             : inline __m256 FIXUP_LANES(__m256 x)
     645             : {
     646             :     return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
     647             : }
     648             : 
     649             : #endif
     650             : 
     651             : static int
     652          14 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
     653             :                         const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     654             :                         uint16_t *CPL_RESTRICT pDstScanline)
     655             : {
     656             :     // Optimized implementation for RMS on UInt16 by
     657             :     // processing by group of 4 output pixels.
     658          14 :     const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     659             : 
     660          14 :     int iDstPixel = 0;
     661          14 :     const auto zero = _mm_setzero_si128();
     662             : 
     663             : #ifdef __AVX2__
     664             :     const auto zeroDot25 = _mm256_set1_pd(0.25);
     665             :     const auto zeroDot5 = _mm256_set1_pd(0.5);
     666             : 
     667             :     // The first four 0's could be anything, as we only take the bottom
     668             :     // 128 bits.
     669             :     const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
     670             : #else
     671          14 :     const auto zeroDot25 = _mm_set1_pd(0.25);
     672          14 :     const auto zeroDot5 = _mm_set1_pd(0.5);
     673             : #endif
     674             : 
     675          14 :     constexpr int DEST_ELTS =
     676             :         static_cast<int>(sizeof(zero) / sizeof(uint16_t)) / 2;
     677          52 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     678             :     {
     679             :         // Load 8 UInt16 from each line
     680          38 :         const auto firstLine = _mm_loadu_si128(
     681             :             reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     682             :         const auto secondLine =
     683          38 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     684          38 :                 pSrcScanlineShifted + nChunkXSize));
     685             : 
     686             :         // Detect if all of the source values fit in 14 bits.
     687             :         // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
     688             :         // and we can do a much faster implementation.
     689             :         const auto maskTmp =
     690          76 :             _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
     691             : #if defined(__i386__) || defined(_M_IX86)
     692             :         uint64_t nMaskFitsIn14Bits = 0;
     693             :         _mm_storel_epi64(
     694             :             reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
     695             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     696             : #else
     697          38 :         const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
     698             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     699             : #endif
     700          38 :         if (nMaskFitsIn14Bits == 0)
     701             :         {
     702             :             // Multiplication of 16 bit values and horizontal
     703             :             // addition of 32 bit results
     704             :             const auto firstLineHSumSquare =
     705          26 :                 _mm_madd_epi16(firstLine, firstLine);
     706             :             const auto secondLineHSumSquare =
     707          26 :                 _mm_madd_epi16(secondLine, secondLine);
     708             :             // Vertical addition
     709             :             const auto sumSquares =
     710          26 :                 _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
     711             :             // In theory we should take sqrt(sumSquares * 0.25f)
     712             :             // but given the rounding we do, this is equivalent to
     713             :             // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
     714             :             // sumSquares <= 4 * 16383^2
     715          26 :             const auto one32 = _mm_set1_epi32(1);
     716             :             const auto sumSquaresPlusOneDiv4 =
     717          52 :                 _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
     718             :             // Take square root and truncate/floor to int32
     719          78 :             auto rms = _mm_cvttps_epi32(
     720             :                 _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
     721             : 
     722             :             // Round to upper value if it minimizes the
     723             :             // error |rms^2 - sumSquares/4|
     724             :             // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     725             :             //    rms += 1;
     726             :             // which is equivalent to:
     727             :             // if( rms * rms + rms < (sumSquares+1) / 4 )
     728             :             //    rms += 1;
     729             :             auto mask =
     730          78 :                 _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
     731             :                                 _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
     732          26 :             rms = _mm_sub_epi32(rms, mask);
     733             :             // Pack each 32 bit RMS value to 16 bits
     734          26 :             rms = _mm_packs_epi32(rms, rms /* could be anything */);
     735             :             _mm_storel_epi64(
     736          26 :                 reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
     737          26 :             pSrcScanlineShifted += 2 * DEST_ELTS;
     738          26 :             continue;
     739             :         }
     740             : 
     741             :         // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
     742             :         // to 32 bit would result in 4 multiplications instead of 8, but
     743             :         // mullo/mulhi have a worse throughput than mul_pd.
     744             : 
     745             :         // Extend those UInt16s as UInt32s
     746          12 :         const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
     747          12 :         const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
     748          12 :         const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
     749          12 :         const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
     750             : 
     751             : #ifdef __AVX2__
     752             :         // Multiplication of 32 bit values previously converted to 64 bit double
     753             :         const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
     754             :         const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
     755             :         const auto secondLineLoDbl =
     756             :             SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
     757             :         const auto secondLineHiDbl =
     758             :             SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
     759             : 
     760             :         // Vertical addition of squares
     761             :         const auto sumSquaresLo =
     762             :             _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
     763             :         const auto sumSquaresHi =
     764             :             _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
     765             : 
     766             :         // Horizontal addition of squares
     767             :         const auto sumSquares =
     768             :             FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
     769             : 
     770             :         const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
     771             : 
     772             :         // Take square root and truncate/floor to int32
     773             :         auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
     774             :         const auto rmsDouble = _mm256_cvtepi32_pd(rms);
     775             :         const auto right = _mm256_sub_pd(
     776             :             sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
     777             : 
     778             :         auto mask =
     779             :             _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
     780             :         // Extract 32-bit from each of the 4 64-bit masks
     781             :         // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
     782             :         // _MM_SHUFFLE(2,0,2,0)));
     783             :         mask = _mm256_permutevar8x32_ps(mask, permutation);
     784             :         const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
     785             : 
     786             :         // Apply the correction
     787             :         rms = _mm_sub_epi32(rms, maskI);
     788             : 
     789             :         // Pack each 32 bit RMS value to 16 bits
     790             :         rms = _mm_packus_epi32(rms, rms /* could be anything */);
     791             : #else
     792             :         // Multiplication of 32 bit values previously converted to 64 bit double
     793          12 :         const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
     794             :         const auto firstLineLoHi =
     795          24 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
     796          12 :         const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
     797             :         const auto firstLineHiHi =
     798          24 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
     799             : 
     800          12 :         const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
     801             :         const auto secondLineLoHi =
     802          24 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
     803          12 :         const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
     804             :         const auto secondLineHiHi =
     805          24 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
     806             : 
     807             :         // Vertical addition of squares
     808          12 :         const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
     809          12 :         const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
     810          12 :         const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
     811          12 :         const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
     812             : 
     813             :         // Horizontal addition of squares
     814          12 :         const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
     815          12 :         const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
     816             : 
     817          12 :         const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
     818          12 :         const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
     819             :         // Take square root and truncate/floor to int32
     820          24 :         const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
     821          24 :         const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
     822             : 
     823             :         // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
     824             :         // if( 0.5 < sumDivWeight - (rms * rms + rms) )
     825             :         //     rms += 1;
     826          12 :         const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
     827          12 :         const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
     828          24 :         const auto rightLo = _mm_sub_pd(
     829             :             sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
     830          36 :         const auto rightHi = _mm_sub_pd(
     831             :             sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
     832             : 
     833          24 :         const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
     834          12 :         const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
     835             :         // The value of the mask will be -1 when the correction needs to be
     836             :         // applied
     837          24 :         const auto mask = _mm_castps_si128(_mm_shuffle_ps(
     838             :             maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
     839             : 
     840          48 :         auto rms = _mm_castps_si128(
     841             :             _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
     842             :         // Apply the correction
     843          12 :         rms = _mm_sub_epi32(rms, mask);
     844             : 
     845             :         // Pack each 32 bit RMS value to 16 bits
     846          12 :         rms = GDAL_mm_int32_to_uint16(rms);
     847             : #endif
     848             : 
     849          12 :         _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
     850             :                          rms);
     851          12 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     852             :     }
     853             : 
     854          14 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     855          14 :     return iDstPixel;
     856             : }
     857             : 
     858             : /************************************************************************/
     859             : /*                         AverageUInt16SSE2()                          */
     860             : /************************************************************************/
     861             : 
     862             : static int
     863          13 : AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
     864             :                   const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     865             :                   uint16_t *CPL_RESTRICT pDstScanline)
     866             : {
     867             :     // Optimized implementation for average on UInt16 by
     868             :     // processing by group of 8 output pixels.
     869             : 
     870          13 :     const auto mask = _mm_set1_epi32(0xFFFF);
     871          13 :     const auto two = _mm_set1_epi32(2);
     872          13 :     const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     873             : 
     874          13 :     int iDstPixel = 0;
     875          13 :     constexpr int DEST_ELTS = static_cast<int>(sizeof(mask) / sizeof(uint16_t));
     876          25 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     877             :     {
     878             :         __m128i averageLow;
     879             :         // Load 8 UInt16 from each line
     880             :         {
     881          12 :             const auto firstLine = _mm_loadu_si128(
     882             :                 reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     883             :             const auto secondLine =
     884          12 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     885          12 :                     pSrcScanlineShifted + nChunkXSize));
     886             : 
     887             :             // Horizontal addition and extension to 32 bit
     888          36 :             const auto horizAddFirstLine = _mm_add_epi32(
     889             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     890             :             const auto horizAddSecondLine =
     891          36 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     892             :                               _mm_srli_epi32(secondLine, 16));
     893             : 
     894             :             // Vertical addition and average computation
     895             :             // average = (sum + 2) >> 2
     896          24 :             const auto sum = _mm_add_epi32(
     897             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     898          12 :             averageLow = _mm_srli_epi32(sum, 2);
     899             :         }
     900             :         // Load 8 UInt16 from each line
     901             :         __m128i averageHigh;
     902             :         {
     903             :             const auto firstLine =
     904          12 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     905          12 :                     pSrcScanlineShifted + DEST_ELTS));
     906             :             const auto secondLine =
     907          12 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     908          12 :                     pSrcScanlineShifted + DEST_ELTS + nChunkXSize));
     909             : 
     910             :             // Horizontal addition and extension to 32 bit
     911          36 :             const auto horizAddFirstLine = _mm_add_epi32(
     912             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     913             :             const auto horizAddSecondLine =
     914          36 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     915             :                               _mm_srli_epi32(secondLine, 16));
     916             : 
     917             :             // Vertical addition and average computation
     918             :             // average = (sum + 2) >> 2
     919          24 :             const auto sum = _mm_add_epi32(
     920             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     921          12 :             averageHigh = _mm_srli_epi32(sum, 2);
     922             :         }
     923             : 
     924             :         // Pack each 32 bit average value to 16 bits
     925          12 :         auto average = GDAL_mm_packus_epi32(averageLow, averageHigh);
     926          12 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
     927             :                          average);
     928          12 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     929             :     }
     930             : 
     931          13 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     932          13 :     return iDstPixel;
     933             : }
     934             : 
     935             : /************************************************************************/
     936             : /*                       QuadraticMeanFloatSSE2()                       */
     937             : /************************************************************************/
     938             : 
     939             : #if !defined(ARM_V7)
     940             : 
     941             : #ifdef __SSE3__
     942             : #define sse2_hadd_ps _mm_hadd_ps
     943             : #else
     944          82 : inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
     945             : {
     946          82 :     auto aEven_bEven = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
     947          82 :     auto aOdd_bOdd = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
     948          82 :     return _mm_add_ps(aEven_bEven, aOdd_bOdd);  // (aEven + aOdd, bEven + bOdd)
     949             : }
     950             : #endif
     951             : 
     952             : #ifdef __AVX2__
     953             : #define set1_ps _mm256_set1_ps
     954             : #define loadu_ps _mm256_loadu_ps
     955             : #define andnot_ps _mm256_andnot_ps
     956             : #define and_ps _mm256_and_ps
     957             : #define max_ps _mm256_max_ps
     958             : #define shuffle_ps _mm256_shuffle_ps
     959             : #define div_ps _mm256_div_ps
     960             : #define cmpeq_ps(x, y) _mm256_cmp_ps((x), (y), _CMP_EQ_OQ)
     961             : #define mul_ps _mm256_mul_ps
     962             : #define add_ps _mm256_add_ps
     963             : #define hadd_ps _mm256_hadd_ps
     964             : #define sqrt_ps _mm256_sqrt_ps
     965             : #define or_ps _mm256_or_ps
     966             : #define unpacklo_ps _mm256_unpacklo_ps
     967             : #define unpackhi_ps _mm256_unpackhi_ps
     968             : #define storeu_ps _mm256_storeu_ps
     969             : #define blendv_ps _mm256_blendv_ps
     970             : 
     971             : inline __m256 SQUARE_PS(__m256 x)
     972             : {
     973             :     return _mm256_mul_ps(x, x);
     974             : }
     975             : 
     976             : #else
     977             : 
     978             : #define set1_ps _mm_set1_ps
     979             : #define loadu_ps _mm_loadu_ps
     980             : #define andnot_ps _mm_andnot_ps
     981             : #define and_ps _mm_and_ps
     982             : #define max_ps _mm_max_ps
     983             : #define shuffle_ps _mm_shuffle_ps
     984             : #define div_ps _mm_div_ps
     985             : #define cmpeq_ps _mm_cmpeq_ps
     986             : #define mul_ps _mm_mul_ps
     987             : #define add_ps _mm_add_ps
     988             : #define hadd_ps sse2_hadd_ps
     989             : #define sqrt_ps _mm_sqrt_ps
     990             : #define or_ps _mm_or_ps
     991             : #define unpacklo_ps _mm_unpacklo_ps
     992             : #define unpackhi_ps _mm_unpackhi_ps
     993             : #define storeu_ps _mm_storeu_ps
     994             : 
     995         132 : inline __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
     996             : {
     997             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     998             :     return _mm_blendv_ps(a, b, mask);
     999             : #else
    1000         396 :     return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
    1001             : #endif
    1002             : }
    1003             : 
    1004         528 : inline __m128 SQUARE_PS(__m128 x)
    1005             : {
    1006         528 :     return _mm_mul_ps(x, x);
    1007             : }
    1008             : 
    1009         132 : inline __m128 FIXUP_LANES(__m128 x)
    1010             : {
    1011         132 :     return x;
    1012             : }
    1013             : 
    1014             : #endif
    1015             : 
    1016             : static int
    1017             : #if defined(__GNUC__)
    1018             :     __attribute__((noinline))
    1019             : #endif
    1020          66 :     QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
    1021             :                            const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1022             :                            float *CPL_RESTRICT pDstScanline)
    1023             : {
    1024             :     // Optimized implementation for RMS on Float32 by
    1025             :     // processing by group of output pixels.
    1026          66 :     const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
    1027             : 
    1028          66 :     int iDstPixel = 0;
    1029          66 :     const auto minus_zero = set1_ps(-0.0f);
    1030          66 :     const auto zeroDot25 = set1_ps(0.25f);
    1031          66 :     const auto one = set1_ps(1.0f);
    1032          66 :     const auto infv = set1_ps(std::numeric_limits<float>::infinity());
    1033          66 :     constexpr int DEST_ELTS = static_cast<int>(sizeof(one) / sizeof(float));
    1034             : 
    1035         198 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
    1036             :     {
    1037             :         // Load 2*DEST_ELTS Float32 from each line
    1038         132 :         auto firstLineLo = loadu_ps(pSrcScanlineShifted);
    1039         132 :         auto firstLineHi = loadu_ps(pSrcScanlineShifted + DEST_ELTS);
    1040         132 :         auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
    1041             :         auto secondLineHi =
    1042         264 :             loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize);
    1043             : 
    1044             :         // Take the absolute value
    1045         132 :         firstLineLo = andnot_ps(minus_zero, firstLineLo);
    1046         132 :         firstLineHi = andnot_ps(minus_zero, firstLineHi);
    1047         132 :         secondLineLo = andnot_ps(minus_zero, secondLineLo);
    1048         132 :         secondLineHi = andnot_ps(minus_zero, secondLineHi);
    1049             : 
    1050             :         auto firstLineEven =
    1051         132 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1052             :         auto firstLineOdd =
    1053         132 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1054             :         auto secondLineEven =
    1055         132 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1056             :         auto secondLineOdd =
    1057         132 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1058             : 
    1059             :         // Compute the maximum of each DEST_ELTS value to RMS-average
    1060         396 :         const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
    1061             :                                  max_ps(secondLineEven, secondLineOdd));
    1062             : 
    1063             :         // Normalize each value by the maximum of the DEST_ELTS ones.
    1064             :         // This step is important to avoid that the square evaluates to infinity
    1065             :         // for sufficiently big input.
    1066         132 :         auto invMax = div_ps(one, maxV);
    1067             :         // Deal with 0 being the maximum to correct division by zero
    1068             :         // note: comparing to -0 leads to identical results as to comparing with
    1069             :         // 0
    1070         264 :         invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
    1071             : 
    1072         132 :         firstLineEven = mul_ps(firstLineEven, invMax);
    1073         132 :         firstLineOdd = mul_ps(firstLineOdd, invMax);
    1074         132 :         secondLineEven = mul_ps(secondLineEven, invMax);
    1075         132 :         secondLineOdd = mul_ps(secondLineOdd, invMax);
    1076             : 
    1077             :         // Compute squares
    1078         132 :         firstLineEven = SQUARE_PS(firstLineEven);
    1079         132 :         firstLineOdd = SQUARE_PS(firstLineOdd);
    1080         132 :         secondLineEven = SQUARE_PS(secondLineEven);
    1081         132 :         secondLineOdd = SQUARE_PS(secondLineOdd);
    1082             : 
    1083         396 :         const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
    1084             :                                        add_ps(secondLineEven, secondLineOdd));
    1085             : 
    1086         396 :         auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
    1087             : 
    1088             :         // Deal with infinity being the maximum
    1089         132 :         const auto maskIsInf = cmpeq_ps(maxV, infv);
    1090         132 :         rms = blendv_ps(rms, infv, maskIsInf);
    1091             : 
    1092         132 :         rms = FIXUP_LANES(rms);
    1093             : 
    1094         132 :         storeu_ps(&pDstScanline[iDstPixel], rms);
    1095         132 :         pSrcScanlineShifted += DEST_ELTS * 2;
    1096             :     }
    1097             : 
    1098          66 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    1099          66 :     return iDstPixel;
    1100             : }
    1101             : 
    1102             : /************************************************************************/
    1103             : /*                          AverageFloatSSE2()                          */
    1104             : /************************************************************************/
    1105             : 
    1106          50 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
    1107             :                             const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1108             :                             float *CPL_RESTRICT pDstScanline)
    1109             : {
    1110             :     // Optimized SSE2 implementation of the 2x2 average on Float32, producing DEST_ELTS
    1111             :     // output pixels per iteration. Returns how many were written; pSrcScanlineShiftedInOut is advanced past the consumed input.
    1112          50 :     const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
    1113             : 
    1114          50 :     int iDstPixel = 0;
    1115          50 :     const auto zeroDot25 = _mm_set1_ps(0.25f);
    1116          50 :     constexpr int DEST_ELTS =
    1117             :         static_cast<int>(sizeof(zeroDot25) / sizeof(float));
    1118             : 
    1119         132 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
    1120             :     {
    1121             :         // Load 2 * DEST_ELTS Float32 from each of the two source lines, pre-multiplied by 0.25
    1122             :         const auto firstLineLo =
    1123          82 :             _mm_mul_ps(_mm_loadu_ps(pSrcScanlineShifted), zeroDot25);
    1124         164 :         const auto firstLineHi = _mm_mul_ps(
    1125             :             _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS), zeroDot25);
    1126          82 :         const auto secondLineLo = _mm_mul_ps(
    1127          82 :             _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize), zeroDot25);
    1128         164 :         const auto secondLineHi = _mm_mul_ps(
    1129          82 :             _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize),
    1130             :             zeroDot25);
    1131             : 
    1132             :         // Vertical addition: sum the two source lines element-wise
    1133          82 :         const auto tmpLo = _mm_add_ps(firstLineLo, secondLineLo);
    1134          82 :         const auto tmpHi = _mm_add_ps(firstLineHi, secondLineHi);
    1135             : 
    1136             :         // Horizontal addition of tmpLo and tmpHi through the sse2_hadd_ps helper, giving the DEST_ELTS averages stored below
    1137          82 :         const auto average = sse2_hadd_ps(tmpLo, tmpHi);
    1138             : 
    1139          82 :         _mm_storeu_ps(&pDstScanline[iDstPixel], average);
    1140          82 :         pSrcScanlineShifted += DEST_ELTS * 2;
    1141             :     }
    1142             : 
    1143          50 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    1144          50 :     return iDstPixel;
    1145             : }
    1146             : 
    1147             : /************************************************************************/
    1148             : /*                         AverageDoubleSSE2()                          */
    1149             : /************************************************************************/
    1150             : 
    1151             : static int
    1152          50 : AverageDoubleSSE2(int nDstXWidth, int nChunkXSize,
    1153             :                   const double *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1154             :                   double *CPL_RESTRICT pDstScanline)
    1155             : {
    1156             :     // Optimized SSE2 implementation of the 2x2 average on Float64, producing DEST_ELTS
    1157             :     // output pixels per iteration. Returns how many were written; pSrcScanlineShiftedInOut is advanced past the consumed input.
    1158          50 :     const double *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
    1159             : 
    1160          50 :     int iDstPixel = 0;
    1161          50 :     const auto zeroDot25 = _mm_set1_pd(0.25);
    1162          50 :     constexpr int DEST_ELTS =
    1163             :         static_cast<int>(sizeof(zeroDot25) / sizeof(double));
    1164             : 
    1165         211 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
    1166             :     {
    1167             :         // Load 2 * DEST_ELTS Float64 from each of the two source lines, pre-multiplied by 0.25
    1168         161 :         const auto firstLine0 = _mm_mul_pd(
    1169             :             _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS), zeroDot25);
    1170         322 :         const auto firstLine1 = _mm_mul_pd(
    1171             :             _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS), zeroDot25);
    1172         161 :         const auto secondLine0 = _mm_mul_pd(
    1173         161 :             _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS + nChunkXSize),
    1174             :             zeroDot25);
    1175         322 :         const auto secondLine1 = _mm_mul_pd(
    1176         161 :             _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS + nChunkXSize),
    1177             :             zeroDot25);
    1178             : 
    1179             :         // Vertical addition: sum the two source lines element-wise
    1180         161 :         const auto tmp0 = _mm_add_pd(firstLine0, secondLine0);
    1181         161 :         const auto tmp1 = _mm_add_pd(firstLine1, secondLine1);
    1182             : 
    1183             :         // Horizontal addition of tmp0 and tmp1 through the sse2_hadd_pd helper, giving the DEST_ELTS averages stored below
    1184         161 :         const auto average0 = sse2_hadd_pd(tmp0, tmp1);
    1185             : 
    1186         161 :         _mm_storeu_pd(&pDstScanline[iDstPixel + 0], average0);
    1187         161 :         pSrcScanlineShifted += DEST_ELTS * 2;
    1188             :     }
    1189             : 
    1190          50 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    1191          50 :     return iDstPixel;
    1192             : }
    1193             : 
    1194             : #endif
    1195             : 
    1196             : #endif
    1197             : 
    1198             : /************************************************************************/
    1199             : /*                   GDALResampleChunk_AverageOrRMS()                   */
    1200             : /************************************************************************/
    1201             : 
    1202             : template <class T, class Tsum, GDALDataType eWrkDataType, bool bQuadraticMean>
    1203             : static CPLErr
    1204        7347 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
    1205             :                                  const T *pChunk, void **ppDstBuffer)
    1206             : {
    1207        7347 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    1208        7347 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    1209        7347 :     const double dfSrcXDelta = args.dfSrcXDelta;
    1210        7347 :     const double dfSrcYDelta = args.dfSrcYDelta;
    1211        7347 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    1212        7347 :     const int nChunkXOff = args.nChunkXOff;
    1213        7347 :     const int nChunkYOff = args.nChunkYOff;
    1214        7347 :     const int nChunkXSize = args.nChunkXSize;
    1215        7347 :     const int nChunkYSize = args.nChunkYSize;
    1216        7347 :     const int nDstXOff = args.nDstXOff;
    1217        7347 :     const int nDstXOff2 = args.nDstXOff2;
    1218        7347 :     const int nDstYOff = args.nDstYOff;
    1219        7347 :     const int nDstYOff2 = args.nDstYOff2;
    1220        7347 :     const char *pszResampling = args.pszResampling;
    1221        7347 :     bool bHasNoData = args.bHasNoData;
    1222        7347 :     const double dfNoDataValue = args.dfNoDataValue;
    1223        7347 :     const GDALColorTable *const poColorTable =
    1224             :         !bQuadraticMean &&
    1225             :                 // AVERAGE_BIT2GRAYSCALE
    1226        7264 :                 STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G")
    1227             :             ? nullptr
    1228             :             : args.poColorTable;
    1229        7347 :     const bool bPropagateNoData = args.bPropagateNoData;
    1230             : 
    1231        7347 :     T tNoDataValue = (!bHasNoData) ? 0 : static_cast<T>(dfNoDataValue);
    1232        7347 :     const T tReplacementVal =
    1233         206 :         bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
    1234          72 :                          args.eOvrDataType, dfNoDataValue))
    1235             :                    : 0;
    1236             : 
    1237        7347 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    1238        7347 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    1239        7347 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
    1240             : 
    1241             :     /* -------------------------------------------------------------------- */
    1242             :     /*      Allocate buffers.                                               */
    1243             :     /* -------------------------------------------------------------------- */
    1244        7347 :     *ppDstBuffer = static_cast<T *>(
    1245        7347 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
    1246             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
    1247        7347 :     if (*ppDstBuffer == nullptr)
    1248             :     {
    1249           0 :         return CE_Failure;
    1250             :     }
    1251        7347 :     T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
    1252             : 
    1253             :     struct PrecomputedXValue
    1254             :     {
    1255             :         int nLeftXOffShifted;
    1256             :         int nRightXOffShifted;
    1257             :         double dfLeftWeight;
    1258             :         double dfRightWeight;
    1259             :         double dfTotalWeightFullLine;
    1260             :     };
    1261             : 
    1262             :     PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
    1263        7347 :         VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
    1264             : 
    1265        7347 :     if (pasSrcX == nullptr)
    1266             :     {
    1267           0 :         return CE_Failure;
    1268             :     }
    1269             : 
    1270        7347 :     std::vector<GDALColorEntry> colorEntries;
    1271             : 
    1272        7347 :     if (poColorTable)
    1273             :     {
    1274           5 :         int nTransparentIdx = -1;
    1275           5 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    1276             : 
    1277             :         // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    1278             :         // it as nodata value
    1279           6 :         if (bHasNoData && dfNoDataValue >= 0.0 &&
    1280           1 :             tNoDataValue < colorEntries.size())
    1281           1 :             colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
    1282             : 
    1283             :         // Or if we have no explicit nodata, but a color table entry that is
    1284             :         // transparent, consider it as the nodata value
    1285           4 :         else if (!bHasNoData && nTransparentIdx >= 0)
    1286             :         {
    1287           0 :             bHasNoData = true;
    1288           0 :             tNoDataValue = static_cast<T>(nTransparentIdx);
    1289             :         }
    1290             :     }
    1291             : 
    1292             :     /* ==================================================================== */
    1293             :     /*      Precompute inner loop constants.                                */
    1294             :     /* ==================================================================== */
    1295        7347 :     bool bSrcXSpacingIsTwo = true;
    1296        7347 :     int nLastSrcXOff2 = -1;
    1297     1659150 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    1298             :     {
    1299     1651805 :         const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    1300             :         // Apply some epsilon to avoid numerical precision issues
    1301     1651805 :         const int nSrcXOff =
    1302     1651805 :             std::max(static_cast<int>(dfSrcXOff + 1e-8), nChunkXOff);
    1303     1651805 :         const double dfSrcXOff2 =
    1304     1651805 :             dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    1305     1651805 :         int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    1306     1651805 :         if (nSrcXOff2 == nSrcXOff)
    1307           0 :             nSrcXOff2++;
    1308     1651805 :         if (nSrcXOff2 > nChunkRightXOff)
    1309           1 :             nSrcXOff2 = nChunkRightXOff;
    1310             : 
    1311     1651805 :         pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
    1312     1651805 :         pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
    1313     1651805 :             nSrcXOff2 - nChunkXOff;
    1314          21 :         pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
    1315     1651805 :             (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
    1316     1651805 :         pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
    1317     1651805 :             1 - (nSrcXOff2 - dfSrcXOff2);
    1318     1651805 :         pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
    1319     1651805 :             pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
    1320     1651805 :         if (nSrcXOff + 1 < nSrcXOff2)
    1321             :         {
    1322     1651779 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1323     1651779 :                 nSrcXOff2 - nSrcXOff - 2;
    1324     1651779 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1325     1651779 :                 pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
    1326             :         }
    1327             : 
    1328     1651805 :         if (nSrcXOff2 - nSrcXOff != 2 ||
    1329     1553902 :             (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
    1330             :         {
    1331       91989 :             bSrcXSpacingIsTwo = false;
    1332             :         }
    1333     1651805 :         nLastSrcXOff2 = nSrcXOff2;
    1334             :     }
    1335             : 
    1336             :     /* ==================================================================== */
    1337             :     /*      Loop over destination scanlines.                                */
    1338             :     /* ==================================================================== */
    1339      701567 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    1340             :     {
    1341      694220 :         const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    1342      694220 :         int nSrcYOff = std::max(static_cast<int>(dfSrcYOff + 1e-8), nChunkYOff);
    1343             : 
    1344      694220 :         const double dfSrcYOff2 =
    1345      694220 :             dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    1346      694220 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    1347      694220 :         if (nSrcYOff2 == nSrcYOff)
    1348           0 :             ++nSrcYOff2;
    1349      694220 :         if (nSrcYOff2 > nChunkBottomYOff)
    1350           3 :             nSrcYOff2 = nChunkBottomYOff;
    1351             : 
    1352      694220 :         T *const pDstScanline =
    1353      694220 :             pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
    1354             : 
    1355             :         /* --------------------------------------------------------------------
    1356             :          */
    1357             :         /*      Loop over destination pixels */
    1358             :         /* --------------------------------------------------------------------
    1359             :          */
    1360      694220 :         if (poColorTable == nullptr)
    1361             :         {
    1362      694105 :             if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
    1363             :                 pabyChunkNodataMask == nullptr)
    1364             :             {
    1365             :                 if constexpr (eWrkDataType == GDT_UInt8 ||
    1366             :                               eWrkDataType == GDT_UInt16)
    1367             :                 {
    1368             :                     // Optimized case : no nodata, overview by a factor of 2 and
    1369             :                     // regular x and y src spacing.
    1370      125552 :                     const T *pSrcScanlineShifted =
    1371      125552 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1372      125552 :                         static_cast<size_t>(nSrcYOff - nChunkYOff) *
    1373      125552 :                             nChunkXSize;
    1374      125552 :                     int iDstPixel = 0;
    1375             : #ifdef USE_SSE2
    1376             :                     if constexpr (eWrkDataType == GDT_UInt8)
    1377             :                     {
    1378             :                         if constexpr (bQuadraticMean)
    1379             :                         {
    1380        5389 :                             iDstPixel = QuadraticMeanByteSSE2OrAVX2(
    1381             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1382             :                                 pDstScanline);
    1383             :                         }
    1384             :                         else
    1385             :                         {
    1386      120136 :                             iDstPixel = AverageByteSSE2OrAVX2(
    1387             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1388             :                                 pDstScanline);
    1389             :                         }
    1390             :                     }
    1391             :                     else
    1392             :                     {
    1393             :                         static_assert(eWrkDataType == GDT_UInt16);
    1394             :                         if constexpr (bQuadraticMean)
    1395             :                         {
    1396          14 :                             iDstPixel = QuadraticMeanUInt16SSE2(
    1397             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1398             :                                 pDstScanline);
    1399             :                         }
    1400             :                         else
    1401             :                         {
    1402          13 :                             iDstPixel = AverageUInt16SSE2(
    1403             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1404             :                                 pDstScanline);
    1405             :                         }
    1406             :                     }
    1407             : #endif
    1408      300011 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1409             :                     {
    1410      174459 :                         Tsum nTotal = 0;
    1411             :                         T nVal;
    1412             :                         if constexpr (bQuadraticMean)
    1413          52 :                             nTotal =
    1414          52 :                                 SQUARE<Tsum>(pSrcScanlineShifted[0]) +
    1415          52 :                                 SQUARE<Tsum>(pSrcScanlineShifted[1]) +
    1416          52 :                                 SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
    1417          52 :                                 SQUARE<Tsum>(
    1418          52 :                                     pSrcScanlineShifted[1 + nChunkXSize]);
    1419             :                         else
    1420      174407 :                             nTotal = pSrcScanlineShifted[0] +
    1421      174407 :                                      pSrcScanlineShifted[1] +
    1422      174407 :                                      pSrcScanlineShifted[nChunkXSize] +
    1423      174407 :                                      pSrcScanlineShifted[1 + nChunkXSize];
    1424             : 
    1425      174459 :                         constexpr int nTotalWeight = 4;
    1426             :                         if constexpr (bQuadraticMean)
    1427          52 :                             nVal = ComputeIntegerRMS_4values<T>(nTotal);
    1428             :                         else
    1429      174407 :                             nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
    1430             :                                                   nTotalWeight);
    1431             : 
    1432             :                         // No need to compare nVal against tNoDataValue as we
    1433             :                         // are in a case where pabyChunkNodataMask == nullptr
    1434             :                         // implies the absence of nodata value.
    1435      174459 :                         pDstScanline[iDstPixel] = nVal;
    1436      174459 :                         pSrcScanlineShifted += 2;
    1437             :                     }
    1438             :                 }
    1439             :                 else
    1440             :                 {
    1441             :                     static_assert(eWrkDataType == GDT_Float32 ||
    1442             :                                   eWrkDataType == GDT_Float64);
    1443         202 :                     const T *pSrcScanlineShifted =
    1444         202 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1445         202 :                         static_cast<size_t>(nSrcYOff - nChunkYOff) *
    1446         202 :                             nChunkXSize;
    1447         202 :                     int iDstPixel = 0;
    1448             : #if defined(USE_SSE2) && !defined(ARM_V7)
    1449             :                     if constexpr (eWrkDataType == GDT_Float32)
    1450             :                     {
    1451             :                         static_assert(std::is_same_v<T, float>);
    1452             :                         if constexpr (bQuadraticMean)
    1453             :                         {
    1454          66 :                             iDstPixel = QuadraticMeanFloatSSE2(
    1455             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1456             :                                 pDstScanline);
    1457             :                         }
    1458             :                         else
    1459             :                         {
    1460          50 :                             iDstPixel = AverageFloatSSE2(
    1461             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1462             :                                 pDstScanline);
    1463             :                         }
    1464             :                     }
    1465             :                     else
    1466             :                     {
    1467             :                         if constexpr (!bQuadraticMean)
    1468             :                         {
    1469          50 :                             iDstPixel = AverageDoubleSSE2(
    1470             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1471             :                                 pDstScanline);
    1472             :                         }
    1473             :                     }
    1474             : #endif
    1475             : 
    1476         726 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1477             :                     {
    1478             :                         T nVal;
    1479             : 
    1480             :                         if constexpr (bQuadraticMean)
    1481             :                         {
    1482             :                             // Avoid issues with large values by renormalizing
    1483          96 :                             const auto max = std::max(
    1484         420 :                                 {std::fabs(pSrcScanlineShifted[0]),
    1485         420 :                                  std::fabs(pSrcScanlineShifted[1]),
    1486         420 :                                  std::fabs(pSrcScanlineShifted[nChunkXSize]),
    1487         420 :                                  std::fabs(
    1488         420 :                                      pSrcScanlineShifted[1 + nChunkXSize])});
    1489         420 :                             if (max == 0)
    1490             :                             {
    1491           8 :                                 nVal = 0;
    1492             :                             }
    1493         412 :                             else if (std::isinf(max))
    1494             :                             {
    1495             :                                 // If there is at least one infinity value,
    1496             :                                 // then just summing, and taking the abs
    1497             :                                 // value will give the expected result:
    1498             :                                 // * +inf if all values are +inf
    1499             :                                 // * +inf if all values are -inf
    1500             :                                 // * NaN otherwise
    1501          82 :                                 nVal = std::fabs(
    1502          82 :                                     pSrcScanlineShifted[0] +
    1503          82 :                                     pSrcScanlineShifted[1] +
    1504          82 :                                     pSrcScanlineShifted[nChunkXSize] +
    1505          82 :                                     pSrcScanlineShifted[1 + nChunkXSize]);
    1506             :                             }
    1507             :                             else
    1508             :                             {
    1509         330 :                                 const auto inv_max = static_cast<T>(1.0) / max;
    1510         330 :                                 nVal =
    1511             :                                     max *
    1512         330 :                                     std::sqrt(
    1513             :                                         static_cast<T>(0.25) *
    1514         330 :                                         (SQUARE(pSrcScanlineShifted[0] *
    1515         330 :                                                 inv_max) +
    1516         330 :                                          SQUARE(pSrcScanlineShifted[1] *
    1517         330 :                                                 inv_max) +
    1518         330 :                                          SQUARE(
    1519         330 :                                              pSrcScanlineShifted[nChunkXSize] *
    1520         330 :                                              inv_max) +
    1521         330 :                                          SQUARE(
    1522         330 :                                              pSrcScanlineShifted[1 +
    1523             :                                                                  nChunkXSize] *
    1524             :                                              inv_max)));
    1525             :                             }
    1526             :                         }
    1527             :                         else
    1528             :                         {
    1529         104 :                             constexpr auto weight = static_cast<T>(0.25);
    1530             :                             // Multiply each value by weight to avoid
    1531             :                             // potential overflow
    1532         104 :                             nVal =
    1533         104 :                                 (weight * pSrcScanlineShifted[0] +
    1534         104 :                                  weight * pSrcScanlineShifted[1] +
    1535         104 :                                  weight * pSrcScanlineShifted[nChunkXSize] +
    1536         104 :                                  weight * pSrcScanlineShifted[1 + nChunkXSize]);
    1537             :                         }
    1538             : 
    1539             :                         // No need to compare nVal against tNoDataValue as we
    1540             :                         // are in a case where pabyChunkNodataMask == nullptr
    1541             :                         // implies the absence of nodata value.
    1542         524 :                         pDstScanline[iDstPixel] = nVal;
    1543         524 :                         pSrcScanlineShifted += 2;
    1544             :                     }
    1545      125754 :                 }
    1546             :             }
    1547             :             else
    1548             :             {
    1549          17 :                 const double dfBottomWeight =
    1550      568351 :                     (nSrcYOff + 1 == nSrcYOff2) ? 1.0
    1551      568334 :                                                 : 1.0 - (dfSrcYOff - nSrcYOff);
    1552      568351 :                 const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
    1553      568351 :                 nSrcYOff -= nChunkYOff;
    1554      568351 :                 nSrcYOff2 -= nChunkYOff;
    1555             : 
    1556      568351 :                 double dfTotalWeightFullColumn = dfBottomWeight;
    1557      568351 :                 if (nSrcYOff + 1 < nSrcYOff2)
    1558             :                 {
    1559      568334 :                     dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
    1560      568334 :                     dfTotalWeightFullColumn += dfTopWeight;
    1561             :                 }
    1562             : 
    1563     9784185 :                 for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1564             :                 {
    1565     9215839 :                     const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1566     9215839 :                     const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1567             : 
    1568     9215839 :                     double dfTotal = 0;
    1569     9215839 :                     double dfTotalWeight = 0;
    1570     9215839 :                     [[maybe_unused]] double dfMulFactor = 1.0;
    1571     9215839 :                     [[maybe_unused]] double dfInvMulFactor = 1.0;
    1572     9215839 :                     constexpr bool bUseMulFactor =
    1573             :                         (eWrkDataType == GDT_Float32 ||
    1574             :                          eWrkDataType == GDT_Float64);
    1575     9215839 :                     if (pabyChunkNodataMask == nullptr)
    1576             :                     {
    1577             :                         if constexpr (bUseMulFactor)
    1578             :                         {
    1579             :                             if constexpr (bQuadraticMean)
    1580             :                             {
    1581          80 :                                 T mulFactor = 0;
    1582          80 :                                 auto pChunkShifted =
    1583          80 :                                     pChunk +
    1584          80 :                                     static_cast<size_t>(nSrcYOff) * nChunkXSize;
    1585             : 
    1586         240 :                                 for (int iY = nSrcYOff; iY < nSrcYOff2;
    1587         160 :                                      ++iY, pChunkShifted += nChunkXSize)
    1588             :                                 {
    1589         480 :                                     for (int iX = nSrcXOff; iX < nSrcXOff2;
    1590             :                                          ++iX)
    1591         640 :                                         mulFactor = std::max(
    1592             :                                             mulFactor,
    1593         320 :                                             std::fabs(pChunkShifted[iX]));
    1594             :                                 }
    1595          80 :                                 dfMulFactor = double(mulFactor);
    1596         142 :                                 dfInvMulFactor =
    1597          62 :                                     dfMulFactor > 0 &&
    1598          62 :                                             std::isfinite(dfMulFactor)
    1599             :                                         ? 1.0 / dfMulFactor
    1600             :                                         : 1.0;
    1601             :                             }
    1602             :                             else
    1603             :                             {
    1604         139 :                                 dfMulFactor = (nSrcYOff2 - nSrcYOff) *
    1605         139 :                                               (nSrcXOff2 - nSrcXOff);
    1606         139 :                                 dfInvMulFactor = 1.0 / dfMulFactor;
    1607             :                             }
    1608             :                         }
    1609             : 
    1610     1746545 :                         auto pChunkShifted =
    1611         227 :                             pChunk +
    1612     1746545 :                             static_cast<size_t>(nSrcYOff) * nChunkXSize;
    1613     1746545 :                         int nCounterY = nSrcYOff2 - nSrcYOff - 1;
    1614     1746545 :                         double dfWeightY = dfBottomWeight;
    1615     3493539 :                         while (true)
    1616             :                         {
    1617             :                             double dfTotalLine;
    1618             :                             if constexpr (bQuadraticMean)
    1619             :                             {
    1620             :                                 // Left pixel
    1621             :                                 {
    1622         216 :                                     const T val = pChunkShifted[nSrcXOff];
    1623         216 :                                     dfTotalLine =
    1624         216 :                                         SQUARE(double(val) * dfInvMulFactor) *
    1625         216 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1626             :                                 }
    1627             : 
    1628         216 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1629             :                                 {
    1630             :                                     // Middle pixels
    1631         216 :                                     for (int iX = nSrcXOff + 1;
    1632         536 :                                          iX < nSrcXOff2 - 1; ++iX)
    1633             :                                     {
    1634         320 :                                         const T val = pChunkShifted[iX];
    1635         320 :                                         dfTotalLine += SQUARE(double(val) *
    1636             :                                                               dfInvMulFactor);
    1637             :                                     }
    1638             : 
    1639             :                                     // Right pixel
    1640             :                                     {
    1641         216 :                                         const T val =
    1642         216 :                                             pChunkShifted[nSrcXOff2 - 1];
    1643         216 :                                         dfTotalLine +=
    1644         216 :                                             SQUARE(double(val) *
    1645         216 :                                                    dfInvMulFactor) *
    1646         216 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1647             :                                     }
    1648             :                                 }
    1649             :                             }
    1650             :                             else
    1651             :                             {
    1652             :                                 // Left pixel
    1653             :                                 {
    1654     5239868 :                                     const T val = pChunkShifted[nSrcXOff];
    1655     5239868 :                                     dfTotalLine =
    1656     5239868 :                                         double(val) * dfInvMulFactor *
    1657     5239868 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1658             :                                 }
    1659             : 
    1660     5239868 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1661             :                                 {
    1662             :                                     // Middle pixels
    1663     4239442 :                                     for (int iX = nSrcXOff + 1;
    1664    64183238 :                                          iX < nSrcXOff2 - 1; ++iX)
    1665             :                                     {
    1666    59943836 :                                         const T val = pChunkShifted[iX];
    1667    59943836 :                                         dfTotalLine +=
    1668    59943836 :                                             double(val) * dfInvMulFactor;
    1669             :                                     }
    1670             : 
    1671             :                                     // Right pixel
    1672             :                                     {
    1673     4239442 :                                         const T val =
    1674     4239442 :                                             pChunkShifted[nSrcXOff2 - 1];
    1675     4239442 :                                         dfTotalLine +=
    1676     4239442 :                                             double(val) * dfInvMulFactor *
    1677     4239442 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1678             :                                     }
    1679             :                                 }
    1680             :                             }
    1681             : 
    1682     5240084 :                             dfTotal += dfTotalLine * dfWeightY;
    1683     5240084 :                             --nCounterY;
    1684     5240084 :                             if (nCounterY < 0)
    1685     1746545 :                                 break;
    1686     3493539 :                             pChunkShifted += nChunkXSize;
    1687     3493539 :                             dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
    1688             :                         }
    1689             : 
    1690     1746545 :                         dfTotalWeight =
    1691     1746545 :                             pasSrcX[iDstPixel].dfTotalWeightFullLine *
    1692             :                             dfTotalWeightFullColumn;
    1693             :                     }
    1694             :                     else
    1695             :                     {
    1696     7469294 :                         size_t nCount = 0;
    1697    30285576 :                         for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1698             :                         {
    1699    22816292 :                             const auto pChunkShifted =
    1700    22816292 :                                 pChunk + static_cast<size_t>(iY) * nChunkXSize;
    1701             : 
    1702    22816292 :                             double dfTotalLine = 0;
    1703    22816292 :                             double dfTotalWeightLine = 0;
    1704             :                             // Left pixel
    1705             :                             {
    1706    22816292 :                                 const int iX = nSrcXOff;
    1707    22816292 :                                 const T val = pChunkShifted[iX];
    1708    22816292 :                                 if (pabyChunkNodataMask
    1709    22816292 :                                         [iX +
    1710    22816292 :                                          static_cast<size_t>(iY) * nChunkXSize])
    1711             :                                 {
    1712    17325139 :                                     nCount++;
    1713    17325139 :                                     const double dfWeightX =
    1714    17325139 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1715    17325139 :                                     dfTotalWeightLine = dfWeightX;
    1716             :                                     if constexpr (bQuadraticMean)
    1717         508 :                                         dfTotalLine =
    1718         508 :                                             SQUARE(double(val)) * dfWeightX;
    1719             :                                     else
    1720    17324631 :                                         dfTotalLine = double(val) * dfWeightX;
    1721             :                                 }
    1722             :                             }
    1723             : 
    1724    22816292 :                             if (nSrcXOff < nSrcXOff2 - 1)
    1725             :                             {
    1726             :                                 // Middle pixels
    1727    61618372 :                                 for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
    1728             :                                      ++iX)
    1729             :                                 {
    1730    38802080 :                                     const T val = pChunkShifted[iX];
    1731    38802080 :                                     if (pabyChunkNodataMask
    1732    38802080 :                                             [iX + static_cast<size_t>(iY) *
    1733    38802080 :                                                       nChunkXSize])
    1734             :                                     {
    1735    28038780 :                                         nCount++;
    1736    28038780 :                                         dfTotalWeightLine += 1;
    1737             :                                         if constexpr (bQuadraticMean)
    1738         640 :                                             dfTotalLine += SQUARE(double(val));
    1739             :                                         else
    1740    28038140 :                                             dfTotalLine += double(val);
    1741             :                                     }
    1742             :                                 }
    1743             : 
    1744             :                                 // Right pixel
    1745             :                                 {
    1746    22816292 :                                     const int iX = nSrcXOff2 - 1;
    1747    22816292 :                                     const T val = pChunkShifted[iX];
    1748    22816292 :                                     if (pabyChunkNodataMask
    1749    22816292 :                                             [iX + static_cast<size_t>(iY) *
    1750    22816292 :                                                       nChunkXSize])
    1751             :                                     {
    1752    17324495 :                                         nCount++;
    1753    17324495 :                                         const double dfWeightX =
    1754    17324495 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1755    17324495 :                                         dfTotalWeightLine += dfWeightX;
    1756             :                                         if constexpr (bQuadraticMean)
    1757         503 :                                             dfTotalLine +=
    1758         503 :                                                 SQUARE(double(val)) * dfWeightX;
    1759             :                                         else
    1760    17323992 :                                             dfTotalLine +=
    1761    17323992 :                                                 double(val) * dfWeightX;
    1762             :                                     }
    1763             :                                 }
    1764             :                             }
    1765             : 
    1766    38163300 :                             const double dfWeightY =
    1767             :                                 (iY == nSrcYOff)        ? dfBottomWeight
    1768    15347008 :                                 : (iY + 1 == nSrcYOff2) ? dfTopWeight
    1769             :                                                         : 1.0;
    1770    22816292 :                             dfTotal += dfTotalLine * dfWeightY;
    1771    22816292 :                             dfTotalWeight += dfTotalWeightLine * dfWeightY;
    1772             :                         }
    1773             : 
    1774     7469294 :                         if (nCount == 0 ||
    1775           8 :                             (bPropagateNoData &&
    1776             :                              nCount <
    1777           8 :                                  static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    1778           8 :                                      (nSrcXOff2 - nSrcXOff)))
    1779             :                         {
    1780     2307682 :                             pDstScanline[iDstPixel] = tNoDataValue;
    1781     2307682 :                             continue;
    1782             :                         }
    1783             :                     }
    1784             :                     if constexpr (eWrkDataType == GDT_UInt8)
    1785             :                     {
    1786             :                         T nVal;
    1787             :                         if constexpr (bQuadraticMean)
    1788          38 :                             nVal = ComputeIntegerRMS<T, int>(dfTotal,
    1789             :                                                              dfTotalWeight);
    1790             :                         else
    1791     6901260 :                             nVal =
    1792     6901260 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1793     6901298 :                         if (bHasNoData && nVal == tNoDataValue)
    1794           0 :                             nVal = tReplacementVal;
    1795     6901298 :                         pDstScanline[iDstPixel] = nVal;
    1796             :                     }
    1797             :                     else if constexpr (eWrkDataType == GDT_UInt16)
    1798             :                     {
    1799             :                         T nVal;
    1800             :                         if constexpr (bQuadraticMean)
    1801           4 :                             nVal = ComputeIntegerRMS<T, uint64_t>(
    1802             :                                 dfTotal, dfTotalWeight);
    1803             :                         else
    1804           4 :                             nVal =
    1805           4 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1806           8 :                         if (bHasNoData && nVal == tNoDataValue)
    1807           0 :                             nVal = tReplacementVal;
    1808           8 :                         pDstScanline[iDstPixel] = nVal;
    1809             :                     }
    1810             :                     else
    1811             :                     {
    1812             :                         T nVal;
    1813             :                         if constexpr (bQuadraticMean)
    1814             :                         {
    1815             :                             if constexpr (bUseMulFactor)
    1816         249 :                                 nVal = static_cast<T>(
    1817         132 :                                     dfMulFactor *
    1818         249 :                                     sqrt(dfTotal / dfTotalWeight));
    1819             :                             else
    1820             :                                 nVal = static_cast<T>(
    1821             :                                     sqrt(dfTotal / dfTotalWeight));
    1822             :                         }
    1823             :                         else
    1824             :                         {
    1825             :                             if constexpr (bUseMulFactor)
    1826        6602 :                                 nVal = static_cast<T>(
    1827        6602 :                                     dfMulFactor * (dfTotal / dfTotalWeight));
    1828             :                             else
    1829             :                                 nVal = static_cast<T>(dfTotal / dfTotalWeight);
    1830             :                         }
    1831        6851 :                         if (bHasNoData && nVal == tNoDataValue)
    1832           2 :                             nVal = tReplacementVal;
    1833        6851 :                         pDstScanline[iDstPixel] = nVal;
    1834             :                     }
    1835             :                 }
    1836             :             }
    1837             :         }
    1838             :         else
    1839             :         {
    1840         115 :             nSrcYOff -= nChunkYOff;
    1841         115 :             nSrcYOff2 -= nChunkYOff;
    1842             : 
    1843        6590 :             for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1844             :             {
    1845        6475 :                 const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1846        6475 :                 const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1847             : 
    1848        6475 :                 uint64_t nTotalR = 0;
    1849        6475 :                 uint64_t nTotalG = 0;
    1850        6475 :                 uint64_t nTotalB = 0;
    1851        6475 :                 size_t nCount = 0;
    1852             : 
    1853       19425 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1854             :                 {
    1855       38850 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    1856             :                     {
    1857       25900 :                         const T val =
    1858       25900 :                             pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
    1859             :                         // cppcheck-suppress unsignedLessThanZero
    1860       25900 :                         if (val < 0 || val >= colorEntries.size())
    1861           0 :                             continue;
    1862       25900 :                         const size_t idx = static_cast<size_t>(val);
    1863       25900 :                         const auto &entry = colorEntries[idx];
    1864       25900 :                         if (entry.c4)
    1865             :                         {
    1866             :                             if constexpr (bQuadraticMean)
    1867             :                             {
    1868         800 :                                 nTotalR += SQUARE<int>(entry.c1);
    1869         800 :                                 nTotalG += SQUARE<int>(entry.c2);
    1870         800 :                                 nTotalB += SQUARE<int>(entry.c3);
    1871         800 :                                 ++nCount;
    1872             :                             }
    1873             :                             else
    1874             :                             {
    1875       13328 :                                 nTotalR += entry.c1;
    1876       13328 :                                 nTotalG += entry.c2;
    1877       13328 :                                 nTotalB += entry.c3;
    1878       13328 :                                 ++nCount;
    1879             :                             }
    1880             :                         }
    1881             :                     }
    1882             :                 }
    1883             : 
    1884        6475 :                 if (nCount == 0 ||
    1885           0 :                     (bPropagateNoData &&
    1886           0 :                      nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    1887           0 :                                   (nSrcXOff2 - nSrcXOff)))
    1888             :                 {
    1889        2838 :                     pDstScanline[iDstPixel] = tNoDataValue;
    1890             :                 }
    1891             :                 else
    1892             :                 {
    1893             :                     GDALColorEntry color;
    1894             :                     if constexpr (bQuadraticMean)
    1895             :                     {
    1896         200 :                         color.c1 =
    1897         200 :                             static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
    1898         200 :                         color.c2 =
    1899         200 :                             static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
    1900         200 :                         color.c3 =
    1901         200 :                             static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
    1902             :                     }
    1903             :                     else
    1904             :                     {
    1905        3437 :                         color.c1 =
    1906        3437 :                             static_cast<short>((nTotalR + nCount / 2) / nCount);
    1907        3437 :                         color.c2 =
    1908        3437 :                             static_cast<short>((nTotalG + nCount / 2) / nCount);
    1909        3437 :                         color.c3 =
    1910        3437 :                             static_cast<short>((nTotalB + nCount / 2) / nCount);
    1911             :                     }
    1912        3637 :                     pDstScanline[iDstPixel] =
    1913        3637 :                         static_cast<T>(BestColorEntry(colorEntries, color));
    1914             :                 }
    1915             :             }
    1916             :         }
    1917             :     }
    1918             : 
    1919        7347 :     CPLFree(pasSrcX);
    1920             : 
    1921        7347 :     return CE_None;
    1922             : }
    1923             : 
    1924             : template <bool bQuadraticMean>  // bQuadraticMean: true = RMS (quadratic mean), false = plain average; chosen by GDALResampleChunk_AverageOrRMS().
    1925             : static CPLErr  // Returns whatever the typed worker returns, or CE_Failure when the working type is not handled.
    1926        7347 : GDALResampleChunk_AverageOrRMSInternal(const GDALOverviewResampleArgs &args,  // Dispatches on args.eWrkDataType to the matching GDALResampleChunk_AverageOrRMS_T instantiation.
    1927             :                                        const void *pChunk, void **ppDstBuffer,  // pChunk: untyped source buffer, cast below to the element type of the selected case; ppDstBuffer: forwarded unchanged to the worker.
    1928             :                                        GDALDataType *peDstBufferDataType)  // peDstBufferDataType: output, set to args.eWrkDataType.
    1929             : {
    1930        7347 :     *peDstBufferDataType = args.eWrkDataType;  // Written before dispatching, so it is set even on the failure path.
    1931        7347 :     switch (args.eWrkDataType)
    1932             :     {
    1933        7202 :         case GDT_UInt8:
    1934             :         {
    1935             :             return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_UInt8,  // Template arguments: element type, a second type (presumably the accumulation type, per the UInt16 note below - TODO confirm against the template), working GDAL type, RMS flag.
    1936        7202 :                                                     bQuadraticMean>(
    1937        7202 :                 args, static_cast<const GByte *>(pChunk), ppDstBuffer);
    1938             :         }
    1939             : 
    1940          11 :         case GDT_UInt16:
    1941             :         {
    1942             :             if constexpr (bQuadraticMean)  // Compile-time choice: only one of the two instantiations below exists per value of bQuadraticMean.
    1943             :             {
    1944             :                 // Use double as accumulation type, because UInt32 could overflow
    1945             :                 return GDALResampleChunk_AverageOrRMS_T<
    1946           6 :                     GUInt16, double, GDT_UInt16, bQuadraticMean>(
    1947           6 :                     args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
    1948             :             }
    1949             :             else
    1950             :             {
    1951             :                 return GDALResampleChunk_AverageOrRMS_T<  // Plain average: GUInt32 is passed as the second template argument instead of double.
    1952           5 :                     GUInt16, GUInt32, GDT_UInt16, bQuadraticMean>(
    1953           5 :                     args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
    1954             :             }
    1955             :         }
    1956             : 
    1957          81 :         case GDT_Float32:
    1958             :         {
    1959             :             return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32,  // float elements, double as the second template argument.
    1960          81 :                                                     bQuadraticMean>(
    1961          81 :                 args, static_cast<const float *>(pChunk), ppDstBuffer);
    1962             :         }
    1963             : 
    1964          53 :         case GDT_Float64:
    1965             :         {
    1966             :             return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64,  // double for both the element type and the second template argument.
    1967          53 :                                                     bQuadraticMean>(
    1968          53 :                 args, static_cast<const double *>(pChunk), ppDstBuffer);
    1969             :         }
    1970             : 
    1971           0 :         default:  // Any working type other than the four above is not handled by this dispatcher.
    1972           0 :             break;
    1973             :     }
    1974             : 
    1975           0 :     CPLAssert(false);  // Reached only through the default case, i.e. an unhandled args.eWrkDataType.
    1976             :     return CE_Failure;  // *peDstBufferDataType has already been set above; nothing is written through ppDstBuffer on this path.
    1977             : }
    1978             : 
    1979             : static CPLErr  // Returns the result of the selected GDALResampleChunk_AverageOrRMSInternal instantiation.
    1980        7347 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,  // Entry point for the average / RMS resamplers: picks the compile-time variant from args.pszResampling.
    1981             :                                const void *pChunk, void **ppDstBuffer,  // pChunk, ppDstBuffer and peDstBufferDataType are forwarded unchanged.
    1982             :                                GDALDataType *peDstBufferDataType)
    1983             : {
    1984        7347 :     if (EQUAL(args.pszResampling, "RMS"))  // "RMS" (as compared by EQUAL) selects the quadratic-mean instantiation.
    1985          83 :         return GDALResampleChunk_AverageOrRMSInternal<true>(
    1986          83 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    1987             :     else  // Every other resampling name reaching this function is handled as a plain average.
    1988        7264 :         return GDALResampleChunk_AverageOrRMSInternal<false>(
    1989        7264 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    1990             : }
    1991             : 
    1992             : /************************************************************************/
    1993             : /*                      GDALResampleChunk_Gauss()                       */
    1994             : /************************************************************************/
    1995             : 
    1996          86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
    1997             :                                       const void *pChunk, void **ppDstBuffer,
    1998             :                                       GDALDataType *peDstBufferDataType)
    1999             : 
    2000             : {
    2001          86 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    2002          86 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    2003          86 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    2004          86 :     const int nChunkXOff = args.nChunkXOff;
    2005          86 :     const int nChunkXSize = args.nChunkXSize;
    2006          86 :     const int nChunkYOff = args.nChunkYOff;
    2007          86 :     const int nChunkYSize = args.nChunkYSize;
    2008          86 :     const int nDstXOff = args.nDstXOff;
    2009          86 :     const int nDstXOff2 = args.nDstXOff2;
    2010          86 :     const int nDstYOff = args.nDstYOff;
    2011          86 :     const int nDstYOff2 = args.nDstYOff2;
    2012          86 :     const bool bHasNoData = args.bHasNoData;
    2013          86 :     double dfNoDataValue = args.dfNoDataValue;
    2014          86 :     const GDALColorTable *poColorTable = args.poColorTable;
    2015             : 
    2016          86 :     const double *const padfChunk = static_cast<const double *>(pChunk);
    2017             : 
    2018          86 :     *ppDstBuffer =
    2019          86 :         VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
    2020             :                             GDALGetDataTypeSizeBytes(GDT_Float64));
    2021          86 :     if (*ppDstBuffer == nullptr)
    2022             :     {
    2023           0 :         return CE_Failure;
    2024             :     }
    2025          86 :     *peDstBufferDataType = GDT_Float64;
    2026          86 :     double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
    2027             : 
    2028             :     /* -------------------------------------------------------------------- */
    2029             :     /*      Create the filter kernel and allocate scanline buffer.          */
    2030             :     /* -------------------------------------------------------------------- */
    2031          86 :     int nGaussMatrixDim = 3;
    2032             :     const int *panGaussMatrix;
    2033          86 :     constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
    2034          86 :     constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
    2035             :                                         4,  6, 24, 36, 24, 6, 4,  16, 24,
    2036             :                                         16, 4, 1,  4,  6,  4, 1};
    2037          86 :     constexpr int anGaussMatrix7x7[] = {
    2038             :         1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
    2039             :         6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
    2040             :         120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
    2041             :         90,  36, 6,  1,   6,   15,  20,  15, 6,  1};
    2042             : 
    2043          86 :     const int nOXSize = args.nOvrXSize;
    2044          86 :     const int nOYSize = args.nOvrYSize;
    2045          86 :     const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    2046             : 
    2047             :     // matrix for gauss filter
    2048          86 :     if (nResYFactor <= 2)
    2049             :     {
    2050          85 :         panGaussMatrix = anGaussMatrix3x3;
    2051          85 :         nGaussMatrixDim = 3;
    2052             :     }
    2053           1 :     else if (nResYFactor <= 4)
    2054             :     {
    2055           0 :         panGaussMatrix = anGaussMatrix5x5;
    2056           0 :         nGaussMatrixDim = 5;
    2057             :     }
    2058             :     else
    2059             :     {
    2060           1 :         panGaussMatrix = anGaussMatrix7x7;
    2061           1 :         nGaussMatrixDim = 7;
    2062             :     }
    2063             : 
    2064             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    2065             :     int *panGaussMatrixDup = static_cast<int *>(
    2066             :         CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
    2067             :     memcpy(panGaussMatrixDup, panGaussMatrix,
    2068             :            sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
    2069             :     panGaussMatrix = panGaussMatrixDup;
    2070             : #endif
    2071             : 
    2072          86 :     if (!bHasNoData)
    2073          79 :         dfNoDataValue = 0.0;
    2074             : 
    2075          86 :     std::vector<GDALColorEntry> colorEntries;
    2076          86 :     int nTransparentIdx = -1;
    2077          86 :     if (poColorTable)
    2078           2 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    2079             : 
    2080             :     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    2081             :     // it as nodata value.
    2082          92 :     if (bHasNoData && dfNoDataValue >= 0.0 &&
    2083           6 :         dfNoDataValue < colorEntries.size())
    2084           0 :         colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
    2085             : 
    2086             :     // Or if we have no explicit nodata, but a color table entry that is
    2087             :     // transparent, consider it as the nodata value.
    2088          86 :     else if (!bHasNoData && nTransparentIdx >= 0)
    2089             :     {
    2090           0 :         dfNoDataValue = nTransparentIdx;
    2091             :     }
    2092             : 
    2093          86 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    2094          86 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    2095          86 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
    2096             : 
    2097             :     /* ==================================================================== */
    2098             :     /*      Loop over destination scanlines.                                */
    2099             :     /* ==================================================================== */
    2100       16488 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    2101             :     {
    2102       16402 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    2103       16402 :         int nSrcYOff2 =
    2104       16402 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
    2105             : 
    2106       16402 :         if (nSrcYOff < nChunkYOff)
    2107             :         {
    2108           0 :             nSrcYOff = nChunkYOff;
    2109           0 :             nSrcYOff2++;
    2110             :         }
    2111             : 
    2112       16402 :         const int iSizeY = nSrcYOff2 - nSrcYOff;
    2113       16402 :         nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
    2114       16402 :         nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
    2115             : 
    2116       16402 :         if (nSrcYOff2 > nChunkBottomYOff ||
    2117       16359 :             (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
    2118             :         {
    2119          44 :             nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
    2120             :         }
    2121             : 
    2122       16402 :         int nYShiftGaussMatrix = 0;
    2123       16402 :         if (nSrcYOff < nChunkYOff)
    2124             :         {
    2125           0 :             nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
    2126           0 :             nSrcYOff = nChunkYOff;
    2127             :         }
    2128             : 
    2129       16402 :         const double *const padfSrcScanline =
    2130       16402 :             padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    2131       16402 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    2132       16402 :         if (pabyChunkNodataMask != nullptr)
    2133         152 :             pabySrcScanlineNodataMask =
    2134         152 :                 pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    2135             : 
    2136             :         /* --------------------------------------------------------------------
    2137             :          */
    2138             :         /*      Loop over destination pixels */
    2139             :         /* --------------------------------------------------------------------
    2140             :          */
    2141       16402 :         double *const padfDstScanline =
    2142       16402 :             padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
    2143     4149980 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    2144             :         {
    2145     4133580 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    2146     4133580 :             int nSrcXOff2 =
    2147     4133580 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
    2148             : 
    2149     4133580 :             if (nSrcXOff < nChunkXOff)
    2150             :             {
    2151           0 :                 nSrcXOff = nChunkXOff;
    2152           0 :                 nSrcXOff2++;
    2153             :             }
    2154             : 
    2155     4133580 :             const int iSizeX = nSrcXOff2 - nSrcXOff;
    2156     4133580 :             nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
    2157     4133580 :             nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
    2158             : 
    2159     4133580 :             if (nSrcXOff2 > nChunkRightXOff ||
    2160     4127930 :                 (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
    2161             :             {
    2162        5650 :                 nSrcXOff2 =
    2163        5650 :                     std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
    2164             :             }
    2165             : 
    2166     4133580 :             int nXShiftGaussMatrix = 0;
    2167     4133580 :             if (nSrcXOff < nChunkXOff)
    2168             :             {
    2169           0 :                 nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
    2170           0 :                 nSrcXOff = nChunkXOff;
    2171             :             }
    2172             : 
    2173     4133580 :             if (poColorTable == nullptr)
    2174             :             {
    2175     4133380 :                 double dfTotal = 0.0;
    2176     4133380 :                 GInt64 nCount = 0;
    2177     4133380 :                 const int *panLineWeight =
    2178     4133380 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    2179             :                     nXShiftGaussMatrix;
    2180             : 
    2181    16527900 :                 for (int iY = nSrcYOff; iY < nSrcYOff2;
    2182    12394500 :                      ++iY, panLineWeight += nGaussMatrixDim)
    2183             :                 {
    2184    49561300 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    2185             :                     {
    2186    37166800 :                         const double val =
    2187    37166800 :                             padfSrcScanline[iX - nChunkXOff +
    2188    37166800 :                                             static_cast<GPtrDiff_t>(iY -
    2189    37166800 :                                                                     nSrcYOff) *
    2190    37166800 :                                                 nChunkXSize];
    2191    37166800 :                         if (pabySrcScanlineNodataMask == nullptr ||
    2192       32872 :                             pabySrcScanlineNodataMask[iX - nChunkXOff +
    2193       32872 :                                                       static_cast<GPtrDiff_t>(
    2194       32872 :                                                           iY - nSrcYOff) *
    2195       32872 :                                                           nChunkXSize])
    2196             :                         {
    2197    37146100 :                             const int nWeight = panLineWeight[i];
    2198    37146100 :                             dfTotal += val * nWeight;
    2199    37146100 :                             nCount += nWeight;
    2200             :                         }
    2201             :                     }
    2202             :                 }
    2203             : 
    2204     4133380 :                 if (nCount == 0)
    2205             :                 {
    2206        2217 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2207             :                 }
    2208             :                 else
    2209             :                 {
    2210     4131160 :                     padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
    2211             :                 }
    2212             :             }
    2213             :             else
    2214             :             {
    2215         200 :                 GInt64 nTotalR = 0;
    2216         200 :                 GInt64 nTotalG = 0;
    2217         200 :                 GInt64 nTotalB = 0;
    2218         200 :                 GInt64 nTotalWeight = 0;
    2219         200 :                 const int *panLineWeight =
    2220         200 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    2221             :                     nXShiftGaussMatrix;
    2222             : 
    2223         780 :                 for (int iY = nSrcYOff; iY < nSrcYOff2;
    2224         580 :                      ++iY, panLineWeight += nGaussMatrixDim)
    2225             :                 {
    2226        2262 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    2227             :                     {
    2228        1682 :                         const double val =
    2229        1682 :                             padfSrcScanline[iX - nChunkXOff +
    2230        1682 :                                             static_cast<GPtrDiff_t>(iY -
    2231        1682 :                                                                     nSrcYOff) *
    2232        1682 :                                                 nChunkXSize];
    2233        1682 :                         if (val < 0 || val >= colorEntries.size())
    2234           0 :                             continue;
    2235             : 
    2236        1682 :                         size_t idx = static_cast<size_t>(val);
    2237        1682 :                         if (colorEntries[idx].c4)
    2238             :                         {
    2239        1682 :                             const int nWeight = panLineWeight[i];
    2240        1682 :                             nTotalR +=
    2241        1682 :                                 static_cast<GInt64>(colorEntries[idx].c1) *
    2242        1682 :                                 nWeight;
    2243        1682 :                             nTotalG +=
    2244        1682 :                                 static_cast<GInt64>(colorEntries[idx].c2) *
    2245        1682 :                                 nWeight;
    2246        1682 :                             nTotalB +=
    2247        1682 :                                 static_cast<GInt64>(colorEntries[idx].c3) *
    2248        1682 :                                 nWeight;
    2249        1682 :                             nTotalWeight += nWeight;
    2250             :                         }
    2251             :                     }
    2252             :                 }
    2253             : 
    2254         200 :                 if (nTotalWeight == 0)
    2255             :                 {
    2256           0 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2257             :                 }
    2258             :                 else
    2259             :                 {
    2260             :                     GDALColorEntry color;
    2261             : 
    2262         200 :                     color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
    2263             :                                                   nTotalWeight);
    2264         200 :                     color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
    2265             :                                                   nTotalWeight);
    2266         200 :                     color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
    2267             :                                                   nTotalWeight);
    2268         200 :                     padfDstScanline[iDstPixel - nDstXOff] =
    2269         200 :                         BestColorEntry(colorEntries, color);
    2270             :                 }
    2271             :             }
    2272             :         }
    2273             :     }
    2274             : 
    2275             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    2276             :     CPLFree(panGaussMatrixDup);
    2277             : #endif
    2278             : 
    2279          86 :     return CE_None;
    2280             : }
    2281             : 
    2282             : /************************************************************************/
    2283             : /*                       GDALResampleChunk_Mode()                       */
    2284             : /************************************************************************/
    2285             : 
    2286         688 : template <class T> static inline bool IsSame(T a, T b)
    2287             : {
    2288         688 :     return a == b;
    2289             : }
    2290             : 
    2291          60 : template <> bool IsSame<GFloat16>(GFloat16 a, GFloat16 b)
    2292             : {
    2293          60 :     return a == b || (CPLIsNan(a) && CPLIsNan(b));
    2294             : }
    2295             : 
    2296        5583 : template <> bool IsSame<float>(float a, float b)
    2297             : {
    2298        5583 :     return a == b || (std::isnan(a) && std::isnan(b));
    2299             : }
    2300             : 
    2301        1701 : template <> bool IsSame<double>(double a, double b)
    2302             : {
    2303        1701 :     return a == b || (std::isnan(a) && std::isnan(b));
    2304             : }
    2305             : 
    2306             : namespace
    2307             : {
    2308             : struct ComplexFloat16
    2309             : {
    2310             :     GFloat16 r;
    2311             :     GFloat16 i;
    2312             : };
    2313             : }  // namespace
    2314             : 
    2315          60 : template <> bool IsSame<ComplexFloat16>(ComplexFloat16 a, ComplexFloat16 b)
    2316             : {
    2317          90 :     return (a.r == b.r && a.i == b.i) ||
    2318          90 :            (CPLIsNan(a.r) && CPLIsNan(a.i) && CPLIsNan(b.r) && CPLIsNan(b.i));
    2319             : }
    2320             : 
    2321             : template <>
    2322          60 : bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
    2323             : {
    2324         120 :     return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
    2325         120 :                       std::isnan(b.real()) && std::isnan(b.imag()));
    2326             : }
    2327             : 
    2328             : template <>
    2329          60 : bool IsSame<std::complex<double>>(std::complex<double> a,
    2330             :                                   std::complex<double> b)
    2331             : {
    2332         120 :     return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
    2333         120 :                       std::isnan(b.real()) && std::isnan(b.imag()));
    2334             : }
    2335             : 
    2336             : template <class T>
    2337         182 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
    2338             :                                       const T *pChunk, T *const pDstBuffer)
    2339             : 
    2340             : {
    2341         182 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    2342         182 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    2343         182 :     const double dfSrcXDelta = args.dfSrcXDelta;
    2344         182 :     const double dfSrcYDelta = args.dfSrcYDelta;
    2345         182 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    2346         182 :     const int nChunkXOff = args.nChunkXOff;
    2347         182 :     const int nChunkXSize = args.nChunkXSize;
    2348         182 :     const int nChunkYOff = args.nChunkYOff;
    2349         182 :     const int nChunkYSize = args.nChunkYSize;
    2350         182 :     const int nDstXOff = args.nDstXOff;
    2351         182 :     const int nDstXOff2 = args.nDstXOff2;
    2352         182 :     const int nDstYOff = args.nDstYOff;
    2353         182 :     const int nDstYOff2 = args.nDstYOff2;
    2354         182 :     const bool bHasNoData = args.bHasNoData;
    2355         182 :     const GDALColorTable *poColorTable = args.poColorTable;
    2356         182 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    2357             : 
    2358           8 :     T tNoDataValue;
    2359             :     if constexpr (std::is_same<T, ComplexFloat16>::value)
    2360             :     {
    2361           4 :         tNoDataValue.r = cpl::NumericLimits<GFloat16>::quiet_NaN();
    2362           4 :         tNoDataValue.i = cpl::NumericLimits<GFloat16>::quiet_NaN();
    2363             :     }
    2364             :     else if constexpr (std::is_same<T, std::complex<float>>::value ||
    2365             :                        std::is_same<T, std::complex<double>>::value)
    2366             :     {
    2367             :         using BaseT = typename T::value_type;
    2368           8 :         tNoDataValue =
    2369             :             std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
    2370             :                                 std::numeric_limits<BaseT>::quiet_NaN());
    2371             :     }
    2372         170 :     else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
    2373         169 :         tNoDataValue = 0;
    2374             :     else
    2375           1 :         tNoDataValue = static_cast<T>(args.dfNoDataValue);
    2376             : 
    2377             :     using CountType = uint32_t;
    2378         182 :     CountType nMaxNumPx = 0;
    2379         182 :     T *paVals = nullptr;
    2380         182 :     CountType *panCounts = nullptr;
    2381             : 
    2382         182 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    2383         182 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    2384         364 :     std::vector<int> anVals(256, 0);
    2385             : 
    2386             :     /* ==================================================================== */
    2387             :     /*      Loop over destination scanlines.                                */
    2388             :     /* ==================================================================== */
    2389        7713 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    2390             :     {
    2391        7531 :         const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    2392        7531 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    2393             : #ifdef only_pixels_with_more_than_10_pct_participation
    2394             :         // When oversampling, don't take into account pixels that have a tiny
    2395             :         // participation in the resulting pixel
    2396             :         if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
    2397             :             nSrcYOff < nChunkBottomYOff)
    2398             :             nSrcYOff++;
    2399             : #endif
    2400        7531 :         if (nSrcYOff < nChunkYOff)
    2401           0 :             nSrcYOff = nChunkYOff;
    2402             : 
    2403        7531 :         const double dfSrcYOff2 =
    2404        7531 :             dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    2405        7531 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    2406             : #ifdef only_pixels_with_more_than_10_pct_participation
    2407             :         // When oversampling, don't take into account pixels that have a tiny
    2408             :         // participation in the resulting pixel
    2409             :         if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
    2410             :             nSrcYOff2 > nChunkYOff)
    2411             :             nSrcYOff2--;
    2412             : #endif
    2413        7531 :         if (nSrcYOff2 == nSrcYOff)
    2414           0 :             ++nSrcYOff2;
    2415        7531 :         if (nSrcYOff2 > nChunkBottomYOff)
    2416           0 :             nSrcYOff2 = nChunkBottomYOff;
    2417             : 
    2418        7531 :         const T *const paSrcScanline =
    2419         281 :             pChunk +
    2420        7531 :             (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
    2421        7531 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    2422        7531 :         if (pabyChunkNodataMask != nullptr)
    2423        1838 :             pabySrcScanlineNodataMask =
    2424             :                 pabyChunkNodataMask +
    2425        1838 :                 static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
    2426             : 
    2427        7531 :         T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
    2428             :         /* --------------------------------------------------------------------
    2429             :          */
    2430             :         /*      Loop over destination pixels */
    2431             :         /* --------------------------------------------------------------------
    2432             :          */
    2433     4260596 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    2434             :         {
    2435     4253061 :             const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    2436             :             // Apply some epsilon to avoid numerical precision issues
    2437     4253061 :             int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    2438             : #ifdef only_pixels_with_more_than_10_pct_participation
    2439             :             // When oversampling, don't take into account pixels that have a
    2440             :             // tiny participation in the resulting pixel
    2441             :             if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
    2442             :                 nSrcXOff < nChunkRightXOff)
    2443             :                 nSrcXOff++;
    2444             : #endif
    2445     4253061 :             if (nSrcXOff < nChunkXOff)
    2446           0 :                 nSrcXOff = nChunkXOff;
    2447             : 
    2448     4253061 :             const double dfSrcXOff2 =
    2449     4253061 :                 dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    2450     4253061 :             int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    2451             : #ifdef only_pixels_with_more_than_10_pct_participation
    2452             :             // When oversampling, don't take into account pixels that have a
    2453             :             // tiny participation in the resulting pixel
    2454             :             if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
    2455             :                 nSrcXOff2 > nChunkXOff)
    2456             :                 nSrcXOff2--;
    2457             : #endif
    2458     4253061 :             if (nSrcXOff2 == nSrcXOff)
    2459           0 :                 nSrcXOff2++;
    2460     4253061 :             if (nSrcXOff2 > nChunkRightXOff)
    2461           0 :                 nSrcXOff2 = nChunkRightXOff;
    2462             : 
    2463     4253061 :             bool bRegularProcessing = false;
    2464             :             if constexpr (!std::is_same<T, GByte>::value)
    2465        1671 :                 bRegularProcessing = true;
    2466     4251390 :             else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
    2467           0 :                 bRegularProcessing = true;
    2468             : 
    2469     4253061 :             if (bRegularProcessing)
    2470             :             {
    2471             :                 // Sanity check to make sure the allocation of paVals and
    2472             :                 // panCounts don't overflow.
    2473             :                 static_assert(sizeof(CountType) <= sizeof(size_t));
    2474        3342 :                 if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
    2475        1671 :                     static_cast<CountType>(nSrcYOff2 - nSrcYOff) >
    2476        1671 :                         (std::numeric_limits<CountType>::max() /
    2477        3342 :                          std::max(sizeof(T), sizeof(CountType))) /
    2478        1671 :                             static_cast<CountType>(nSrcXOff2 - nSrcXOff))
    2479             :                 {
    2480           0 :                     CPLError(CE_Failure, CPLE_NotSupported,
    2481             :                              "Too big downsampling factor");
    2482           0 :                     CPLFree(paVals);
    2483           0 :                     CPLFree(panCounts);
    2484           0 :                     return CE_Failure;
    2485             :                 }
    2486        1671 :                 const CountType nNumPx =
    2487        1671 :                     static_cast<CountType>(nSrcYOff2 - nSrcYOff) *
    2488        1671 :                     (nSrcXOff2 - nSrcXOff);
    2489        1671 :                 CountType iMaxInd = 0;
    2490        1671 :                 CountType iMaxVal = 0;
    2491             : 
    2492        1671 :                 if (paVals == nullptr || nNumPx > nMaxNumPx)
    2493             :                 {
    2494             :                     T *paValsNew = static_cast<T *>(
    2495         116 :                         VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
    2496             :                     CountType *panCountsNew =
    2497         116 :                         static_cast<CountType *>(VSI_REALLOC_VERBOSE(
    2498             :                             panCounts, nNumPx * sizeof(CountType)));
    2499         116 :                     if (paValsNew != nullptr)
    2500         116 :                         paVals = paValsNew;
    2501         116 :                     if (panCountsNew != nullptr)
    2502         116 :                         panCounts = panCountsNew;
    2503         116 :                     if (paValsNew == nullptr || panCountsNew == nullptr)
    2504             :                     {
    2505           0 :                         CPLFree(paVals);
    2506           0 :                         CPLFree(panCounts);
    2507           0 :                         return CE_Failure;
    2508             :                     }
    2509         116 :                     nMaxNumPx = nNumPx;
    2510             :                 }
    2511             : 
    2512        5245 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2513             :                 {
    2514        3574 :                     const GPtrDiff_t iTotYOff =
    2515        3574 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2516        3574 :                         nChunkXOff;
    2517       11842 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2518             :                     {
    2519        8268 :                         if (pabySrcScanlineNodataMask == nullptr ||
    2520        1552 :                             pabySrcScanlineNodataMask[iX + iTotYOff])
    2521             :                         {
    2522        8247 :                             const T val = paSrcScanline[iX + iTotYOff];
    2523        8247 :                             CountType i = 0;  // Used after for.
    2524             : 
    2525             :                             // Check array for existing entry.
    2526       11611 :                             for (; i < iMaxInd; ++i)
    2527             :                             {
    2528        8212 :                                 if (IsSame(paVals[i], val))
    2529             :                                 {
    2530        4848 :                                     if (++panCounts[i] > panCounts[iMaxVal])
    2531             :                                     {
    2532         246 :                                         iMaxVal = i;
    2533             :                                     }
    2534        4848 :                                     break;
    2535             :                                 }
    2536             :                             }
    2537             : 
    2538             :                             // Add to arr if entry not already there.
    2539        8247 :                             if (i == iMaxInd)
    2540             :                             {
    2541        3399 :                                 paVals[iMaxInd] = val;
    2542        3399 :                                 panCounts[iMaxInd] = 1;
    2543             : 
    2544        3399 :                                 if (iMaxInd == 0)
    2545             :                                 {
    2546        1668 :                                     iMaxVal = iMaxInd;
    2547             :                                 }
    2548             : 
    2549        3399 :                                 ++iMaxInd;
    2550             :                             }
    2551             :                         }
    2552             :                     }
    2553             :                 }
    2554             : 
    2555        1671 :                 if (iMaxInd == 0)
    2556           3 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2557             :                 else
    2558        1668 :                     paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
    2559             :             }
    2560             :             else if constexpr (std::is_same<T, GByte>::value)
    2561             :             // ( eSrcDataType == GDT_UInt8 && nEntryCount < 256 )
    2562             :             {
    2563             :                 // So we go here for a paletted or non-paletted byte band.
    2564             :                 // The input values are then between 0 and 255.
    2565     4251390 :                 int nMaxVal = 0;
    2566     4251390 :                 int iMaxInd = -1;
    2567             : 
    2568             :                 // The cost of this zeroing might be high. Perhaps we should
    2569             :                 // just use the above generic case, and go to this one if the
    2570             :                 // number of source pixels is large enough
    2571     4251390 :                 std::fill(anVals.begin(), anVals.end(), 0);
    2572             : 
    2573    12777800 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2574             :                 {
    2575     8526440 :                     const GPtrDiff_t iTotYOff =
    2576     8526440 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2577     8526440 :                         nChunkXOff;
    2578    25649600 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2579             :                     {
    2580    17123100 :                         const T val = paSrcScanline[iX + iTotYOff];
    2581    17123100 :                         if (!bHasNoData || val != tNoDataValue)
    2582             :                         {
    2583    17123100 :                             int nVal = static_cast<int>(val);
    2584    17123100 :                             if (++anVals[nVal] > nMaxVal)
    2585             :                             {
    2586             :                                 // Sum the density.
    2587             :                                 // Is it the most common value so far?
    2588    17006400 :                                 iMaxInd = nVal;
    2589    17006400 :                                 nMaxVal = anVals[nVal];
    2590             :                             }
    2591             :                         }
    2592             :                     }
    2593             :                 }
    2594             : 
    2595     4251390 :                 if (iMaxInd == -1)
    2596           0 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2597             :                 else
    2598     4251390 :                     paDstScanline[iDstPixel - nDstXOff] =
    2599             :                         static_cast<T>(iMaxInd);
    2600             :             }
    2601             :         }
    2602             :     }
    2603             : 
    2604         182 :     CPLFree(paVals);
    2605         182 :     CPLFree(panCounts);
    2606             : 
    2607         182 :     return CE_None;
    2608             : }
    2609             : 
    2610         182 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
    2611             :                                      const void *pChunk, void **ppDstBuffer,
    2612             :                                      GDALDataType *peDstBufferDataType)
    2613             : {
    2614         182 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(
    2615             :         args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
    2616             :         GDALGetDataTypeSizeBytes(args.eWrkDataType));
    2617         182 :     if (*ppDstBuffer == nullptr)
    2618             :     {
    2619           0 :         return CE_Failure;
    2620             :     }
    2621             : 
    2622         182 :     CPLAssert(args.eSrcDataType == args.eWrkDataType);
    2623             : 
    2624         182 :     *peDstBufferDataType = args.eWrkDataType;
    2625         182 :     switch (args.eWrkDataType)
    2626             :     {
    2627             :         // For mode resampling, as no computation is done, only the
    2628             :         // size of the data type matters... except for Byte where we have
    2629             :         // special processing. And for floating point values
    2630          66 :         case GDT_UInt8:
    2631             :         {
    2632          66 :             return GDALResampleChunk_ModeT(args,
    2633             :                                            static_cast<const GByte *>(pChunk),
    2634          66 :                                            static_cast<GByte *>(*ppDstBuffer));
    2635             :         }
    2636             : 
    2637           4 :         case GDT_Int8:
    2638             :         {
    2639           4 :             return GDALResampleChunk_ModeT(args,
    2640             :                                            static_cast<const int8_t *>(pChunk),
    2641           4 :                                            static_cast<int8_t *>(*ppDstBuffer));
    2642             :         }
    2643             : 
    2644          10 :         case GDT_Int16:
    2645             :         case GDT_UInt16:
    2646             :         {
    2647          10 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
    2648          10 :             return GDALResampleChunk_ModeT(
    2649             :                 args, static_cast<const uint16_t *>(pChunk),
    2650          10 :                 static_cast<uint16_t *>(*ppDstBuffer));
    2651             :         }
    2652             : 
    2653          15 :         case GDT_CInt16:
    2654             :         case GDT_Int32:
    2655             :         case GDT_UInt32:
    2656             :         {
    2657          15 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
    2658          15 :             return GDALResampleChunk_ModeT(
    2659             :                 args, static_cast<const uint32_t *>(pChunk),
    2660          15 :                 static_cast<uint32_t *>(*ppDstBuffer));
    2661             :         }
    2662             : 
    2663          12 :         case GDT_CInt32:
    2664             :         case GDT_Int64:
    2665             :         case GDT_UInt64:
    2666             :         {
    2667          12 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
    2668          12 :             return GDALResampleChunk_ModeT(
    2669             :                 args, static_cast<const uint64_t *>(pChunk),
    2670          12 :                 static_cast<uint64_t *>(*ppDstBuffer));
    2671             :         }
    2672             : 
    2673           4 :         case GDT_Float16:
    2674             :         {
    2675           4 :             return GDALResampleChunk_ModeT(
    2676             :                 args, static_cast<const GFloat16 *>(pChunk),
    2677           4 :                 static_cast<GFloat16 *>(*ppDstBuffer));
    2678             :         }
    2679             : 
    2680          35 :         case GDT_Float32:
    2681             :         {
    2682          35 :             return GDALResampleChunk_ModeT(args,
    2683             :                                            static_cast<const float *>(pChunk),
    2684          35 :                                            static_cast<float *>(*ppDstBuffer));
    2685             :         }
    2686             : 
    2687          24 :         case GDT_Float64:
    2688             :         {
    2689          24 :             return GDALResampleChunk_ModeT(args,
    2690             :                                            static_cast<const double *>(pChunk),
    2691          24 :                                            static_cast<double *>(*ppDstBuffer));
    2692             :         }
    2693             : 
    2694           4 :         case GDT_CFloat16:
    2695             :         {
    2696           4 :             return GDALResampleChunk_ModeT(
    2697             :                 args, static_cast<const ComplexFloat16 *>(pChunk),
    2698           4 :                 static_cast<ComplexFloat16 *>(*ppDstBuffer));
    2699             :         }
    2700             : 
    2701           4 :         case GDT_CFloat32:
    2702             :         {
    2703           4 :             return GDALResampleChunk_ModeT(
    2704             :                 args, static_cast<const std::complex<float> *>(pChunk),
    2705           4 :                 static_cast<std::complex<float> *>(*ppDstBuffer));
    2706             :         }
    2707             : 
    2708           4 :         case GDT_CFloat64:
    2709             :         {
    2710           4 :             return GDALResampleChunk_ModeT(
    2711             :                 args, static_cast<const std::complex<double> *>(pChunk),
    2712           4 :                 static_cast<std::complex<double> *>(*ppDstBuffer));
    2713             :         }
    2714             : 
    2715           0 :         case GDT_Unknown:
    2716             :         case GDT_TypeCount:
    2717           0 :             break;
    2718             :     }
    2719             : 
    2720           0 :     CPLAssert(false);
    2721             :     return CE_Failure;
    2722             : }
    2723             : 
    2724             : /************************************************************************/
    2725             : /*                 GDALResampleConvolutionHorizontal()                  */
    2726             : /************************************************************************/
    2727             : 
    2728             : template <class T>
    2729             : static inline double
    2730       46038 : GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
    2731             :                                   int nSrcPixelCount)
    2732             : {
    2733       46038 :     double dfVal1 = 0.0;
    2734       46038 :     double dfVal2 = 0.0;
    2735       46038 :     int i = 0;  // Used after for.
    2736             :     // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    2737             :     // manually (untypical) unrolled loop in -O2 and -O3:
    2738             :     // https://github.com/OSGeo/gdal/issues/9508
    2739             : #if !defined(__INTEL_CLANG_COMPILER)
    2740       92396 :     for (; i < nSrcPixelCount - 3; i += 4)
    2741             :     {
    2742       46358 :         dfVal1 += double(pChunk[i + 0]) * padfWeights[i];
    2743       46358 :         dfVal1 += double(pChunk[i + 1]) * padfWeights[i + 1];
    2744       46358 :         dfVal2 += double(pChunk[i + 2]) * padfWeights[i + 2];
    2745       46358 :         dfVal2 += double(pChunk[i + 3]) * padfWeights[i + 3];
    2746             :     }
    2747             : #endif
    2748       48662 :     for (; i < nSrcPixelCount; ++i)
    2749             :     {
    2750        2624 :         dfVal1 += double(pChunk[i]) * padfWeights[i];
    2751             :     }
    2752       46038 :     return dfVal1 + dfVal2;
    2753             : }
    2754             : 
    2755             : template <class T, bool bHasNaN>
    2756       46368 : static inline void GDALResampleConvolutionHorizontalWithMask(
    2757             :     const T *pChunk, const GByte *pabyMask, const double *padfWeights,
    2758             :     int nSrcPixelCount, double &dfWeightValMaskSum, double &dfWeightMaskSum,
    2759             :     double &dfWeightSum)
    2760             : {
    2761       46368 :     dfWeightValMaskSum = 0;
    2762       46368 :     dfWeightMaskSum = 0;
    2763       46368 :     dfWeightSum = 0;
    2764       46368 :     int i = 0;
    2765      103804 :     for (; i < nSrcPixelCount - 3; i += 4)
    2766             :     {
    2767       57436 :         double dfWeightMask0 = padfWeights[i + 0] * pabyMask[i + 0];
    2768       57436 :         double dfWeightMask1 = padfWeights[i + 1] * pabyMask[i + 1];
    2769       57436 :         double dfWeightMask2 = padfWeights[i + 2] * pabyMask[i + 2];
    2770       57436 :         double dfWeightMask3 = padfWeights[i + 3] * pabyMask[i + 3];
    2771             : 
    2772      229744 :         const auto MulNaNAware = [](double v, double &w, double &val)
    2773             :         {
    2774             :             if constexpr (bHasNaN)
    2775             :             {
    2776       14848 :                 if (std::isnan(v))
    2777             :                 {
    2778          76 :                     w = 0;
    2779          76 :                     return;
    2780             :                 }
    2781             :             }
    2782       14772 :             val += v * w;
    2783             :         };
    2784             : 
    2785       57436 :         MulNaNAware(double(pChunk[i + 0]), dfWeightMask0, dfWeightValMaskSum);
    2786       57436 :         MulNaNAware(double(pChunk[i + 1]), dfWeightMask1, dfWeightValMaskSum);
    2787       57436 :         MulNaNAware(double(pChunk[i + 2]), dfWeightMask2, dfWeightValMaskSum);
    2788       57436 :         MulNaNAware(double(pChunk[i + 3]), dfWeightMask3, dfWeightValMaskSum);
    2789       57436 :         dfWeightMaskSum +=
    2790       57436 :             dfWeightMask0 + dfWeightMask1 + dfWeightMask2 + dfWeightMask3;
    2791       57436 :         dfWeightSum += padfWeights[i + 0] + padfWeights[i + 1] +
    2792       57436 :                        padfWeights[i + 2] + padfWeights[i + 3];
    2793             :     }
    2794       64874 :     for (; i < nSrcPixelCount; ++i)
    2795             :     {
    2796       18506 :         const double dfWeightMask = padfWeights[i] * pabyMask[i];
    2797             :         if constexpr (bHasNaN)
    2798             :         {
    2799        1920 :             if (!std::isnan(pChunk[i]))
    2800             :             {
    2801        1920 :                 dfWeightValMaskSum += double(pChunk[i]) * dfWeightMask;
    2802        1920 :                 dfWeightMaskSum += dfWeightMask;
    2803        1920 :                 dfWeightSum += padfWeights[i];
    2804             :             }
    2805             :         }
    2806             :         else
    2807             :         {
    2808       16586 :             dfWeightValMaskSum += double(pChunk[i]) * dfWeightMask;
    2809       16586 :             dfWeightMaskSum += dfWeightMask;
    2810       16586 :             dfWeightSum += padfWeights[i];
    2811             :         }
    2812             :     }
    2813       46368 : }
    2814             : 
    2815             : template <class T, bool bHasNaN>
    2816     1341366 : static inline void GDALResampleConvolutionHorizontal_3rows(
    2817             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2818             :     const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    2819             :     double &dfRes2, double &dfRes3)
    2820             : {
    2821     1341366 :     double dfVal1 = 0.0;
    2822     1341366 :     double dfVal2 = 0.0;
    2823     1341366 :     double dfVal3 = 0.0;
    2824     1341366 :     double dfVal4 = 0.0;
    2825     1341366 :     double dfVal5 = 0.0;
    2826     1341366 :     double dfVal6 = 0.0;
    2827     1341366 :     int i = 0;  // Used after for.
    2828             : 
    2829    16866840 :     const auto MulNaNAware = [](double a, double w)
    2830             :     {
    2831             :         if constexpr (bHasNaN)
    2832             :         {
    2833           0 :             if (std::isnan(a))
    2834           0 :                 return 0.0;
    2835             :         }
    2836    16866900 :         return a * w;
    2837             :     };
    2838             : 
    2839     2736937 :     for (; i < nSrcPixelCount - 3; i += 4)
    2840             :     {
    2841     1395570 :         dfVal1 += MulNaNAware(double(pChunkRow1[i + 0]), padfWeights[i + 0]);
    2842     1395570 :         dfVal1 += MulNaNAware(double(pChunkRow1[i + 1]), padfWeights[i + 1]);
    2843     1395570 :         dfVal2 += MulNaNAware(double(pChunkRow1[i + 2]), padfWeights[i + 2]);
    2844     1395570 :         dfVal2 += MulNaNAware(double(pChunkRow1[i + 3]), padfWeights[i + 3]);
    2845     1395570 :         dfVal3 += MulNaNAware(double(pChunkRow2[i + 0]), padfWeights[i + 0]);
    2846     1395570 :         dfVal3 += MulNaNAware(double(pChunkRow2[i + 1]), padfWeights[i + 1]);
    2847     1395570 :         dfVal4 += MulNaNAware(double(pChunkRow2[i + 2]), padfWeights[i + 2]);
    2848     1395570 :         dfVal4 += MulNaNAware(double(pChunkRow2[i + 3]), padfWeights[i + 3]);
    2849     1395570 :         dfVal5 += MulNaNAware(double(pChunkRow3[i + 0]), padfWeights[i + 0]);
    2850     1395570 :         dfVal5 += MulNaNAware(double(pChunkRow3[i + 1]), padfWeights[i + 1]);
    2851     1395570 :         dfVal6 += MulNaNAware(double(pChunkRow3[i + 2]), padfWeights[i + 2]);
    2852     1395570 :         dfVal6 += MulNaNAware(double(pChunkRow3[i + 3]), padfWeights[i + 3]);
    2853             :     }
    2854     1381377 :     for (; i < nSrcPixelCount; ++i)
    2855             :     {
    2856       40011 :         dfVal1 += MulNaNAware(double(pChunkRow1[i]), padfWeights[i]);
    2857       40011 :         dfVal3 += MulNaNAware(double(pChunkRow2[i]), padfWeights[i]);
    2858       40011 :         dfVal5 += MulNaNAware(double(pChunkRow3[i]), padfWeights[i]);
    2859             :     }
    2860     1341366 :     dfRes1 = dfVal1 + dfVal2;
    2861     1341366 :     dfRes2 = dfVal3 + dfVal4;
    2862     1341366 :     dfRes3 = dfVal5 + dfVal6;
    2863     1341366 : }
    2864             : 
    2865             : template <class T, bool bHasNaN>
    2866       18980 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    2867             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2868             :     const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    2869             :     double &dfRes2, double &dfRes3)
    2870             : {
    2871       18980 :     GDALResampleConvolutionHorizontal_3rows<T, bHasNaN>(
    2872             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, nSrcPixelCount, dfRes1,
    2873             :         dfRes2, dfRes3);
    2874       18980 : }
    2875             : 
    2876             : template <class T, bool bHasNaN>
    2877     1256690 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
    2878             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2879             :     const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
    2880             : {
    2881     1256690 :     GDALResampleConvolutionHorizontal_3rows<T, bHasNaN>(
    2882             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, 4, dfRes1, dfRes2,
    2883             :         dfRes3);
    2884     1256690 : }
    2885             : 
    2886             : /************************************************************************/
    2887             : /*                  GDALResampleConvolutionVertical()                   */
    2888             : /************************************************************************/
    2889             : 
    2890             : template <class T>
    2891             : static inline double
    2892      472545 : GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
    2893             :                                 const double *padfWeights, int nSrcLineCount)
    2894             : {
    2895      472545 :     double dfVal1 = 0.0;
    2896      472545 :     double dfVal2 = 0.0;
    2897      472545 :     int i = 0;
    2898      472545 :     size_t j = 0;
    2899      936186 :     for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
    2900             :     {
    2901      463641 :         dfVal1 += pChunk[j + 0 * nStride] * padfWeights[i + 0];
    2902      463641 :         dfVal1 += pChunk[j + 1 * nStride] * padfWeights[i + 1];
    2903      463641 :         dfVal2 += pChunk[j + 2 * nStride] * padfWeights[i + 2];
    2904      463641 :         dfVal2 += pChunk[j + 3 * nStride] * padfWeights[i + 3];
    2905             :     }
    2906      526884 :     for (; i < nSrcLineCount; ++i, j += nStride)
    2907             :     {
    2908       54339 :         dfVal1 += pChunk[j] * padfWeights[i];
    2909             :     }
    2910      472545 :     return dfVal1 + dfVal2;
    2911             : }
    2912             : 
    2913             : template <class T>
    2914     2930610 : static inline void GDALResampleConvolutionVertical_2cols(
    2915             :     const T *pChunk, size_t nStride, const double *padfWeights,
    2916             :     int nSrcLineCount, double &dfRes1, double &dfRes2)
    2917             : {
    2918     2930610 :     double dfVal1 = 0.0;
    2919     2930610 :     double dfVal2 = 0.0;
    2920     2930610 :     double dfVal3 = 0.0;
    2921     2930610 :     double dfVal4 = 0.0;
    2922     2930610 :     int i = 0;
    2923     2930610 :     size_t j = 0;
    2924     5863170 :     for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
    2925             :     {
    2926     2932560 :         dfVal1 += pChunk[j + 0 + 0 * nStride] * padfWeights[i + 0];
    2927     2932560 :         dfVal3 += pChunk[j + 1 + 0 * nStride] * padfWeights[i + 0];
    2928     2932560 :         dfVal1 += pChunk[j + 0 + 1 * nStride] * padfWeights[i + 1];
    2929     2932560 :         dfVal3 += pChunk[j + 1 + 1 * nStride] * padfWeights[i + 1];
    2930     2932560 :         dfVal2 += pChunk[j + 0 + 2 * nStride] * padfWeights[i + 2];
    2931     2932560 :         dfVal4 += pChunk[j + 1 + 2 * nStride] * padfWeights[i + 2];
    2932     2932560 :         dfVal2 += pChunk[j + 0 + 3 * nStride] * padfWeights[i + 3];
    2933     2932560 :         dfVal4 += pChunk[j + 1 + 3 * nStride] * padfWeights[i + 3];
    2934             :     }
    2935     3053490 :     for (; i < nSrcLineCount; ++i, j += nStride)
    2936             :     {
    2937      122880 :         dfVal1 += pChunk[j + 0] * padfWeights[i];
    2938      122880 :         dfVal3 += pChunk[j + 1] * padfWeights[i];
    2939             :     }
    2940     2930610 :     dfRes1 = dfVal1 + dfVal2;
    2941     2930610 :     dfRes2 = dfVal3 + dfVal4;
    2942     2930610 : }
    2943             : 
    2944             : #ifdef USE_SSE2
    2945             : 
    2946             : #ifdef __AVX__
    2947             : /************************************************************************/
    2948             : /*              GDALResampleConvolutionVertical_16cols<T>               */
    2949             : /************************************************************************/
    2950             : 
    2951             : template <class T>
    2952             : static inline void
    2953             : GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
    2954             :                                        const double *padfWeights,
    2955             :                                        int nSrcLineCount, float *afDest)
    2956             : {
    2957             :     int i = 0;
    2958             :     size_t j = 0;
    2959             :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
    2960             :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2961             :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    2962             :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
    2963             :     for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
    2964             :     {
    2965             :         XMMReg4Double w0 =
    2966             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
    2967             :         XMMReg4Double w1 =
    2968             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    2969             :         XMMReg4Double w2 =
    2970             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    2971             :         XMMReg4Double w3 =
    2972             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    2973             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
    2974             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    2975             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
    2976             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
    2977             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
    2978             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    2979             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
    2980             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
    2981             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
    2982             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    2983             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
    2984             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
    2985             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
    2986             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    2987             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
    2988             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
    2989             :     }
    2990             :     for (; i < nSrcLineCount; ++i, j += nStride)
    2991             :     {
    2992             :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    2993             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    2994             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    2995             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
    2996             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
    2997             :     }
    2998             :     v_acc0.Store4Val(afDest);
    2999             :     v_acc1.Store4Val(afDest + 4);
    3000             :     v_acc2.Store4Val(afDest + 8);
    3001             :     v_acc3.Store4Val(afDest + 12);
    3002             : }
    3003             : 
    3004             : template <class T>
    3005             : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
    3006             :                                                           const double *, int,
    3007             :                                                           double *)
    3008             : {
    3009             :     // Cannot be reached
    3010             :     CPLAssert(false);
    3011             : }
    3012             : 
    3013             : #else
    3014             : 
    3015             : /************************************************************************/
    3016             : /*               GDALResampleConvolutionVertical_8cols<T>               */
    3017             : /************************************************************************/
    3018             : 
    3019             : template <class T>  // Weighted sum over nSrcLineCount rows, computed for 8 adjacent columns at once.
    3020             : static inline void
    3021    25804000 : GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,  // nStride: element offset from one row to the next
    3022             :                                       const double *padfWeights,  // one weight per source row, indexed by i below
    3023             :                                       int nSrcLineCount, float *afDest)  // results are stored at afDest and afDest + 4
    3024             : {
    3025    25804000 :     int i = 0;  // row index, shared by the unrolled loop and the tail loop
    3026    25804000 :     size_t j = 0;  // element offset of row i inside pChunk
    3027    25804000 :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();  // accumulates the values read at column offset 0
    3028    25804000 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();  // accumulates the values read at column offset 4
    3029    53883400 :     for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)  // unrolled: 4 rows per iteration
    3030             :     {
    3031    28079400 :         XMMReg4Double w0 =  // presumably replicates one weight into every lane (helper defined outside this chunk)
    3032    28079400 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
    3033    28079400 :         XMMReg4Double w1 =
    3034    28079400 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    3035    28079400 :         XMMReg4Double w2 =
    3036    28079400 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    3037    28079400 :         XMMReg4Double w3 =
    3038    28079400 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    3039    28079400 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;  // row i
    3040    28079400 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    3041    28079400 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;  // row i + 1
    3042    28079400 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    3043    28079400 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;  // row i + 2
    3044    28079400 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    3045    28079400 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;  // row i + 3
    3046    28079400 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    3047             :     }
    3048    37376100 :     for (; i < nSrcLineCount; ++i, j += nStride)  // tail: the up to 3 rows left over, one per iteration
    3049             :     {
    3050    11572100 :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    3051    11572100 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    3052    11572100 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    3053             :     }
    3054    25804000 :     v_acc0.Store4Val(afDest);  // presumably writes 4 floats (columns 0..3) - confirm against XMMReg4Double
    3055    25804000 :     v_acc1.Store4Val(afDest + 4);  // presumably writes 4 floats (columns 4..7)
    3056    25804000 : }
    3057             : 
    3058             : template <class T>  // Overload selected when the destination pointer is double*; it performs no computation.
    3059             : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,  // NOTE(review): stride is int here but size_t in the float* overload - confirm intentional
    3060             :                                                          const double *, int,
    3061             :                                                          double *)
    3062             : {
    3063             :     // Cannot be reached: the body only asserts, so a call would trip CPLAssert.
    3064             :     CPLAssert(false);
    3065             : }
    3066             : 
    3067             : #endif  // __AVX__
    3068             : 
    3069             : /************************************************************************/
    3070             : /*               GDALResampleConvolutionHorizontalSSE2<T>               */
    3071             : /************************************************************************/
    3072             : 
    3073             : template <class T>  // Returns the sum of pChunk[i] * padfWeightsAligned[i] for i in [0, nSrcPixelCount).
    3074     3375702 : static inline double GDALResampleConvolutionHorizontalSSE2(
    3075             :     const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)  // weights are read with Load4ValAligned; presumably the buffer must meet that load's alignment - confirm
    3076             : {
    3077     3375702 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();  // partial sums for elements i .. i+3 of each 8-element step
    3078     3375702 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();  // partial sums for elements i+4 .. i+7 of each 8-element step
    3079     3375702 :     int i = 0;  // Used after for: the scalar tail loop resumes from this index.
    3080     3754648 :     for (; i < nSrcPixelCount - 7; i += 8)  // vector loop: 8 pixels per iteration
    3081             :     {
    3082             :         // Retrieve the pixels and weights, then accumulate their products
    3083      378952 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
    3084      378952 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
    3085      378952 :         const XMMReg4Double v_weight1 =
    3086      378952 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3087      378952 :         const XMMReg4Double v_weight2 =
    3088      378952 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    3089             : 
    3090      378952 :         v_acc1 += v_pixels1 * v_weight1;
    3091      378952 :         v_acc2 += v_pixels2 * v_weight2;
    3092             :     }
    3093             : 
    3094     3375702 :     v_acc1 += v_acc2;  // merge the two accumulators before reducing
    3095             : 
    3096     3375702 :     double dfVal = v_acc1.GetHorizSum();  // presumably the sum of the register's lanes (helper defined outside this chunk)
    3097    11491480 :     for (; i < nSrcPixelCount; ++i)  // scalar tail: the up to 7 remaining pixels
    3098             :     {
    3099     8115780 :         dfVal += pChunk[i] * padfWeightsAligned[i];
    3100             :     }
    3101     3375702 :     return dfVal;
    3102             : }
    3103             : 
    3104             : /************************************************************************/
    3105             : /*               GDALResampleConvolutionHorizontal<GByte>               */
    3106             : /************************************************************************/
    3107             : 
    3108             : template <>  // GByte specialization: forwards unchanged to the shared SSE2 implementation.
    3109     2826540 : inline double GDALResampleConvolutionHorizontal<GByte>(
    3110             :     const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    3111             : {
    3112     2826540 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
    3113     2826540 :                                                  nSrcPixelCount);
    3114             : }
    3115             : 
    3116             : template <>  // GUInt16 specialization: forwards unchanged to the shared SSE2 implementation.
    3117      549162 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
    3118             :     const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    3119             : {
    3120      549162 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
    3121      549162 :                                                  nSrcPixelCount);
    3122             : }
    3123             : 
    3124             : /************************************************************************/
    3125             : /*           GDALResampleConvolutionHorizontalWithMaskSSE2<T>           */
    3126             : /************************************************************************/
    3127             : 
    3128             : template <class T>  // Computes three sums over i in [0, nSrcPixelCount): value*weight*mask, weight*mask, and weight.
    3129    10626463 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
    3130             :     const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,  // pabyMask[i] is used as a multiplier on weight i
    3131             :     int nSrcPixelCount, double &dfWeightValMaskSum, double &dfWeightMaskSum,  // all three outputs are overwritten, not added to
    3132             :     double &dfWeightSum)
    3133             : {
    3134    10626463 :     int i = 0;  // Used after for: the scalar tail loop resumes from this index.
    3135    10626463 :     XMMReg4Double v_acc_val_mask_weight = XMMReg4Double::Zero();  // running pixel * weight * mask
    3136    10626463 :     XMMReg4Double v_acc_mask_weight = XMMReg4Double::Zero();  // running weight * mask
    3137    10626463 :     XMMReg4Double v_acc_weight = XMMReg4Double::Zero();  // running unmasked weight
    3138    26199121 :     for (; i < nSrcPixelCount - 3; i += 4)  // vector loop: 4 pixels per iteration
    3139             :     {
    3140    15572658 :         const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
    3141    15572658 :         const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
    3142    15572658 :         XMMReg4Double v_weight =
    3143    15572658 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3144    15572658 :         v_acc_weight += v_weight;  // must happen before the weight is masked on the next line
    3145    15572658 :         v_weight *= v_mask;  // from here on v_weight holds weight * mask
    3146    15572658 :         v_acc_val_mask_weight += v_pixels * v_weight;
    3147    15572658 :         v_acc_mask_weight += v_weight;
    3148             :     }
    3149             : 
    3150    10626463 :     dfWeightValMaskSum = v_acc_val_mask_weight.GetHorizSum();  // GetHorizSum presumably adds up the lanes (defined outside this chunk)
    3151    10626463 :     dfWeightMaskSum = v_acc_mask_weight.GetHorizSum();
    3152    10626463 :     dfWeightSum = v_acc_weight.GetHorizSum();
    3153    10910963 :     for (; i < nSrcPixelCount; ++i)  // scalar tail: the up to 3 remaining pixels
    3154             :     {
    3155      284454 :         const double dfWeight = padfWeightsAligned[i];
    3156      284454 :         const double dfWeightMask = dfWeight * pabyMask[i];
    3157      284454 :         dfWeightValMaskSum += pChunk[i] * dfWeightMask;
    3158      284454 :         dfWeightMaskSum += dfWeightMask;
    3159      284454 :         dfWeightSum += dfWeight;
    3160             :     }
    3161    10626463 : }
    3162             : 
    3163             : /************************************************************************/
    3164             : /*           GDALResampleConvolutionHorizontalWithMask<GByte>           */
    3165             : /************************************************************************/
    3166             : 
    3167             : template <>  // <GByte, false> specialization: forwards unchanged to the shared SSE2 implementation.
    3168    10626400 : inline void GDALResampleConvolutionHorizontalWithMask<GByte, false>(
    3169             :     const GByte *pChunk, const GByte *pabyMask,
    3170             :     const double *padfWeightsAligned, int nSrcPixelCount,
    3171             :     double &dfWeightValMaskSum, double &dfWeightMaskSum, double &dfWeightSum)
    3172             : {
    3173    10626400 :     GDALResampleConvolutionHorizontalWithMaskSSE2(
    3174             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount,
    3175             :         dfWeightValMaskSum, dfWeightMaskSum, dfWeightSum);
    3176    10626400 : }
    3177             : 
    3178             : template <>  // <GUInt16, false> specialization: forwards unchanged to the shared SSE2 implementation.
    3179          63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16, false>(
    3180             :     const GUInt16 *pChunk, const GByte *pabyMask,
    3181             :     const double *padfWeightsAligned, int nSrcPixelCount,
    3182             :     double &dfWeightValMaskSum, double &dfWeightMaskSum, double &dfWeightSum)
    3183             : {
    3184          63 :     GDALResampleConvolutionHorizontalWithMaskSSE2(
    3185             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount,
    3186             :         dfWeightValMaskSum, dfWeightMaskSum, dfWeightSum);
    3187          63 : }
    3188             : 
    3189             : /************************************************************************/
    3190             : /*           GDALResampleConvolutionHorizontal_3rows_SSE2<T>            */
    3191             : /************************************************************************/
    3192             : 
    3193             : template <class T>  // Applies one set of weights to three rows at once; dfResN = sum of pChunkRowN[i] * weight[i].
    3194    35560186 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
    3195             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    3196             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,  // dfRes1..3 are overwritten, not added to
    3197             :     double &dfRes2, double &dfRes3)
    3198             : {
    3199    35560186 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero(),  // one accumulator per row
    3200    35560186 :                   v_acc2 = XMMReg4Double::Zero(),
    3201    35560186 :                   v_acc3 = XMMReg4Double::Zero();
    3202    35560186 :     int i = 0;  // used after the vector loop by the scalar tail
    3203    70929556 :     for (; i < nSrcPixelCount - 7; i += 8)  // vector loop: 8 pixels per row per iteration
    3204             :     {
    3205             :         // Retrieve the pixels of row 1 and the weights, then accumulate.
    3206    35369370 :         XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    3207    35369370 :         XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
    3208    35369370 :         const XMMReg4Double v_weight1 =  // weights are loaded once and reused for all three rows
    3209    35369370 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3210    35369370 :         const XMMReg4Double v_weight2 =
    3211    35369370 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    3212             : 
    3213    35369370 :         v_acc1 += v_pixels1 * v_weight1;
    3214    35369370 :         v_acc1 += v_pixels2 * v_weight2;
    3215             : 
    3216    35369370 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);  // row 2, reusing the pixel registers
    3217    35369370 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
    3218    35369370 :         v_acc2 += v_pixels1 * v_weight1;
    3219    35369370 :         v_acc2 += v_pixels2 * v_weight2;
    3220             : 
    3221    35369370 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);  // row 3
    3222    35369370 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
    3223    35369370 :         v_acc3 += v_pixels1 * v_weight1;
    3224    35369370 :         v_acc3 += v_pixels2 * v_weight2;
    3225             :     }
    3226             : 
    3227    35560186 :     dfRes1 = v_acc1.GetHorizSum();  // GetHorizSum presumably adds up the lanes (defined outside this chunk)
    3228    35560186 :     dfRes2 = v_acc2.GetHorizSum();
    3229    35560186 :     dfRes3 = v_acc3.GetHorizSum();
    3230    47825952 :     for (; i < nSrcPixelCount; ++i)  // scalar tail: the up to 7 remaining pixels
    3231             :     {
    3232    12265766 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    3233    12265766 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    3234    12265766 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    3235             :     }
    3236    35560186 : }
    3237             : 
    3238             : /************************************************************************/
    3239             : /*            GDALResampleConvolutionHorizontal_3rows<GByte>            */
    3240             : /************************************************************************/
    3241             : 
    3242             : template <>  // <GByte, false> specialization: forwards unchanged to the shared SSE2 implementation.
    3243    35560100 : inline void GDALResampleConvolutionHorizontal_3rows<GByte, false>(
    3244             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3245             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3246             :     double &dfRes2, double &dfRes3)
    3247             : {
    3248    35560100 :     GDALResampleConvolutionHorizontal_3rows_SSE2(
    3249             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3250             :         dfRes1, dfRes2, dfRes3);
    3251    35560100 : }
    3252             : 
    3253             : template <>  // <GUInt16, false> specialization: forwards unchanged to the shared SSE2 implementation.
    3254          86 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16, false>(
    3255             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3256             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    3257             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
    3258             : {
    3259          86 :     GDALResampleConvolutionHorizontal_3rows_SSE2(
    3260             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3261             :         dfRes1, dfRes2, dfRes3);
    3262          86 : }
    3263             : 
    3264             : /************************************************************************/
    3265             : /*    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>    */
    3266             : /************************************************************************/
    3267             : 
    3268             : template <class T>  // Three-row weighted sum processed 4 pixels at a time; the name suggests callers pass nSrcPixelCount < 8 - verify against caller.
    3269     7849120 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3270             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    3271             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,  // dfRes1..3 are overwritten, not added to
    3272             :     double &dfRes2, double &dfRes3)
    3273             : {
    3274     7849120 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();  // one accumulator per row
    3275     7849120 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    3276     7849120 :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
    3277     7849120 :     int i = 0;  // Used after for: the scalar tail loop resumes from this index.
    3278    19113750 :     for (; i < nSrcPixelCount - 3; i += 4)  // vector loop: 4 pixels per row per iteration
    3279             :     {
    3280             :         // Retrieve the pixels of the three rows and the shared weights, then accumulate.
    3281    11264600 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    3282    11264600 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
    3283    11264600 :         const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
    3284    11264600 :         const XMMReg4Double v_weight =
    3285    11264600 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3286             : 
    3287    11264600 :         v_acc1 += v_pixels1 * v_weight;
    3288    11264600 :         v_acc2 += v_pixels2 * v_weight;
    3289    11264600 :         v_acc3 += v_pixels3 * v_weight;
    3290             :     }
    3291             : 
    3292     7849120 :     dfRes1 = v_acc1.GetHorizSum();  // GetHorizSum presumably adds up the lanes (defined outside this chunk)
    3293     7849120 :     dfRes2 = v_acc2.GetHorizSum();
    3294     7849120 :     dfRes3 = v_acc3.GetHorizSum();
    3295             : 
    3296    12324622 :     for (; i < nSrcPixelCount; ++i)  // scalar tail: the up to 3 remaining pixels
    3297             :     {
    3298     4475522 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    3299     4475522 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    3300     4475522 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    3301             :     }
    3302     7849120 : }
    3303             : 
    3304             : /************************************************************************/
    3305             : /*    GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>     */
    3306             : /************************************************************************/
    3307             : 
    3308             : template <>  // <GByte, false> specialization: forwards unchanged to the shared SSE2 implementation.
    3309             : inline void
    3310     7781970 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte, false>(
    3311             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3312             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3313             :     double &dfRes2, double &dfRes3)
    3314             : {
    3315     7781970 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3316             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3317             :         dfRes1, dfRes2, dfRes3);
    3318     7781970 : }
    3319             : 
    3320             : template <>  // <GUInt16, false> specialization: forwards unchanged to the shared SSE2 implementation.
    3321             : inline void
    3322       67150 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16, false>(
    3323             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3324             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    3325             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
    3326             : {
    3327       67150 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3328             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3329             :         dfRes1, dfRes2, dfRes3);
    3330       67150 : }
    3331             : 
    3332             : /************************************************************************/
    3333             : /*      GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>      */
    3334             : /************************************************************************/
    3335             : 
    3336             : template <class T>  // Loop-free three-row variant: one Load4Val per row against one Load4ValAligned of weights (no pixel-count parameter).
    3337    14904860 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3338             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    3339             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,  // dfRes1..3 are overwritten, not added to
    3340             :     double &dfRes3)
    3341             : {
    3342    14904860 :     const XMMReg4Double v_weight =  // the same weights are applied to all three rows
    3343             :         XMMReg4Double::Load4ValAligned(padfWeightsAligned);
    3344             : 
    3345             :     // Retrieve the pixels of each row, then multiply by the weights.
    3346    14904860 :     const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
    3347    14904860 :     const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
    3348    14904860 :     const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
    3349             : 
    3350    14904860 :     XMMReg4Double v_acc1 = v_pixels1 * v_weight;
    3351    14904860 :     XMMReg4Double v_acc2 = v_pixels2 * v_weight;
    3352    14904860 :     XMMReg4Double v_acc3 = v_pixels3 * v_weight;
    3353             : 
    3354    14904860 :     dfRes1 = v_acc1.GetHorizSum();  // GetHorizSum presumably adds up the lanes (defined outside this chunk)
    3355    14904860 :     dfRes2 = v_acc2.GetHorizSum();
    3356    14904860 :     dfRes3 = v_acc3.GetHorizSum();
    3357    14904860 : }
    3358             : 
    3359             : /************************************************************************/
    3360             : /*      GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>       */
    3361             : /************************************************************************/
    3362             : 
    3363             : template <>  // <GByte, false> specialization: forwards unchanged to the shared SSE2 implementation.
    3364     9192140 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte, false>(
    3365             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3366             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
    3367             :     double &dfRes3)
    3368             : {
    3369     9192140 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3370             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    3371             :         dfRes3);
    3372     9192140 : }
    3373             : 
    3374             : template <>  // <GUInt16, false> specialization: forwards unchanged to the shared SSE2 implementation.
    3375     5712720 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16, false>(
    3376             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3377             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
    3378             :     double &dfRes2, double &dfRes3)
    3379             : {
    3380     5712720 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3381             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    3382             :         dfRes3);
    3383     5712720 : }
    3384             : 
    3385             : #endif  // USE_SSE2
    3386             : 
    3387             : /************************************************************************/
    3388             : /*                   GDALResampleChunk_Convolution()                    */
    3389             : /************************************************************************/
    3390             : 
    3391             : template <class T, class Twork, GDALDataType eWrkDataType,
    3392             :           bool bKernelWithNegativeWeights, bool bNeedRescale>
    3393        9597 : static CPLErr GDALResampleChunk_ConvolutionT(
    3394             :     const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
    3395             :     FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
    3396             :     int nKernelRadius, float fMaxVal)
    3397             : 
    3398             : {
    3399        9597 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    3400        9597 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    3401        9597 :     const double dfSrcXDelta = args.dfSrcXDelta;
    3402        9597 :     const double dfSrcYDelta = args.dfSrcYDelta;
    3403        9597 :     constexpr int nBands = 1;
    3404        9597 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    3405        9597 :     const int nChunkXOff = args.nChunkXOff;
    3406        9597 :     const int nChunkXSize = args.nChunkXSize;
    3407        9597 :     const int nChunkYOff = args.nChunkYOff;
    3408        9597 :     const int nChunkYSize = args.nChunkYSize;
    3409        9597 :     const int nDstXOff = args.nDstXOff;
    3410        9597 :     const int nDstXOff2 = args.nDstXOff2;
    3411        9597 :     const int nDstYOff = args.nDstYOff;
    3412        9597 :     const int nDstYOff2 = args.nDstYOff2;
    3413        9597 :     const bool bHasNoData = args.bHasNoData;
    3414        9597 :     double dfNoDataValue = args.dfNoDataValue;
    3415             : 
    3416        9597 :     if (!bHasNoData)
    3417        9498 :         dfNoDataValue = 0.0;
    3418        9597 :     const auto dstDataType = args.eOvrDataType;
    3419        9597 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
    3420        9597 :     const double dfReplacementVal =
    3421          99 :         bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
    3422             :                    : dfNoDataValue;
    3423             :     // cppcheck-suppress unreadVariable
    3424        9597 :     const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
    3425        9597 :     const bool bNoDataValueInt64Valid =
    3426        9597 :         isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
    3427        9597 :     const auto nNodataValueInt64 =
    3428             :         bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
    3429        9597 :     constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
    3430             : 
    3431             :     // TODO: we should have some generic function to do this.
    3432        9597 :     Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
    3433        9597 :     Twork fDstMax = cpl::NumericLimits<Twork>::max();
    3434        9597 :     if (dstDataType == GDT_UInt8)
    3435             :     {
    3436        8667 :         fDstMin = std::numeric_limits<GByte>::min();
    3437        8667 :         fDstMax = std::numeric_limits<GByte>::max();
    3438             :     }
    3439         930 :     else if (dstDataType == GDT_Int8)
    3440             :     {
    3441           1 :         fDstMin = std::numeric_limits<GInt8>::min();
    3442           1 :         fDstMax = std::numeric_limits<GInt8>::max();
    3443             :     }
    3444         929 :     else if (dstDataType == GDT_UInt16)
    3445             :     {
    3446         402 :         fDstMin = std::numeric_limits<GUInt16>::min();
    3447         402 :         fDstMax = std::numeric_limits<GUInt16>::max();
    3448             :     }
    3449         527 :     else if (dstDataType == GDT_Int16)
    3450             :     {
    3451         292 :         fDstMin = std::numeric_limits<GInt16>::min();
    3452         292 :         fDstMax = std::numeric_limits<GInt16>::max();
    3453             :     }
    3454         235 :     else if (dstDataType == GDT_UInt32)
    3455             :     {
    3456           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
    3457           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
    3458             :     }
    3459         234 :     else if (dstDataType == GDT_Int32)
    3460             :     {
    3461             :         // cppcheck-suppress unreadVariable
    3462           6 :         fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
    3463             :         // cppcheck-suppress unreadVariable
    3464           6 :         fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
    3465             :     }
    3466         228 :     else if (dstDataType == GDT_UInt64)
    3467             :     {
    3468             :         // cppcheck-suppress unreadVariable
    3469           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
    3470             :         // cppcheck-suppress unreadVariable
    3471             :         // (1 << 64) - 2048: largest uint64 value a double can hold
    3472           1 :         fDstMax = static_cast<Twork>(18446744073709549568ULL);
    3473             :     }
    3474         227 :     else if (dstDataType == GDT_Int64)
    3475             :     {
    3476             :         // cppcheck-suppress unreadVariable
    3477           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
    3478             :         // cppcheck-suppress unreadVariable
    3479             :         // (1 << 63) - 1024: largest int64 that a double can hold
    3480           1 :         fDstMax = static_cast<Twork>(9223372036854774784LL);
    3481             :     }
    3482             : 
    3483        9597 :     bool bHasNaN = false;
    3484         490 :     if (pabyChunkNodataMask)
    3485             :     {
    3486             :         if constexpr (std::is_floating_point_v<T>)
    3487             :         {
    3488      120140 :             for (size_t i = 0;
    3489      120140 :                  i < static_cast<size_t>(nChunkXSize) * nChunkYSize; ++i)
    3490             :             {
    3491      120122 :                 if (std::isnan(pChunk[i]))
    3492             :                 {
    3493          24 :                     bHasNaN = true;
    3494          24 :                     break;
    3495             :                 }
    3496             :             }
    3497             :         }
    3498             :     }
    3499             : 
    3500    37413146 :     auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
    3501             :                                bNoDataValueInt64Valid, nNodataValueInt64,
    3502             :                                dfNoDataValue, dfReplacementVal](Twork fVal)
    3503             :     {
    3504    16299800 :         if (!bHasNoData)
    3505    12078600 :             return fVal;
    3506             : 
    3507             :         // Clamp value before comparing to nodata: this is only needed for
    3508             :         // kernels with negative weights (Lanczos)
    3509     4221160 :         Twork fClamped = fVal;
    3510     4221160 :         if (fClamped < fDstMin)
    3511       14504 :             fClamped = fDstMin;
    3512     4206660 :         else if (fClamped > fDstMax)
    3513       13638 :             fClamped = fDstMax;
    3514     4221160 :         if (isIntegerDT)
    3515             :         {
    3516     4220480 :             if (bNoDataValueInt64Valid)
    3517             :             {
    3518     4220470 :                 const double fClampedRounded = double(std::round(fClamped));
    3519     8440960 :                 if (fClampedRounded >=
    3520             :                         static_cast<double>(static_cast<Twork>(
    3521     8440960 :                             std::numeric_limits<int64_t>::min())) &&
    3522             :                     fClampedRounded <= static_cast<double>(static_cast<Twork>(
    3523     8440960 :                                            9223372036854774784LL)) &&
    3524     4220470 :                     nNodataValueInt64 ==
    3525     4220480 :                         static_cast<GInt64>(std::round(fClamped)))
    3526             :                 {
    3527             :                     // Do not use the nodata value
    3528       13195 :                     return static_cast<Twork>(dfReplacementVal);
    3529             :                 }
    3530             :             }
    3531             :         }
    3532         679 :         else if (dfNoDataValue == static_cast<double>(fClamped))
    3533             :         {
    3534             :             // Do not use the nodata value
    3535           1 :             return static_cast<Twork>(dfReplacementVal);
    3536             :         }
    3537     4207960 :         return fClamped;
    3538             :     };
    3539             : 
    3540             :     /* -------------------------------------------------------------------- */
    3541             :     /*      Allocate work buffers.                                          */
    3542             :     /* -------------------------------------------------------------------- */
    3543        9597 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    3544        9597 :     Twork *pafWrkScanline = nullptr;
    3545        9597 :     if (dstDataType != eWrkDataType)
    3546             :     {
    3547             :         pafWrkScanline =
    3548        9385 :             static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
    3549        9385 :         if (pafWrkScanline == nullptr)
    3550           0 :             return CE_Failure;
    3551             :     }
    3552             : 
    3553        9597 :     const double dfXScale = 1.0 / dfXRatioDstToSrc;
    3554        9597 :     const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
    3555        9597 :     const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
    3556        9597 :     const double dfYScale = 1.0 / dfYRatioDstToSrc;
    3557        9597 :     const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
    3558        9597 :     const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
    3559             : 
    3560             :     // Temporary array to store result of horizontal filter.
    3561             :     double *const padfHorizontalFiltered = static_cast<double *>(
    3562        9597 :         VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
    3563        9597 :     const uint64_t nWeightCount = static_cast<uint64_t>(
    3564        9597 :         2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) + 0.5);
    3565        9597 :     if (nWeightCount > std::numeric_limits<uint32_t>::max() / sizeof(double))
    3566             :     {
    3567           0 :         VSIFree(pafWrkScanline);
    3568           0 :         CPLError(CE_Failure, CPLE_NotSupported,
    3569             :                  "Too large downsampling factor");
    3570           0 :         return CE_Failure;
    3571             :     }
    3572             :     // To store convolution coefficients.
    3573             :     double *const padfWeights =
    3574        9597 :         static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
    3575             :             static_cast<size_t>(nWeightCount) * sizeof(double)));
    3576             : 
    3577        9597 :     GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
    3578        9597 :     if (pabyChunkNodataMask)
    3579             :         pabyChunkNodataMaskHorizontalFiltered =
    3580        3357 :             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
    3581        9597 :     if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
    3582        3357 :         (pabyChunkNodataMask != nullptr &&
    3583             :          pabyChunkNodataMaskHorizontalFiltered == nullptr))
    3584             :     {
    3585           0 :         VSIFree(pafWrkScanline);
    3586           0 :         VSIFree(padfHorizontalFiltered);
    3587           0 :         VSIFreeAligned(padfWeights);
    3588           0 :         VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    3589           0 :         return CE_Failure;
    3590             :     }
    3591             : 
    3592             :     /* ==================================================================== */
    3593             :     /*      First pass: horizontal filter                                   */
    3594             :     /* ==================================================================== */
    3595        9597 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    3596             : #ifdef USE_SSE2
    3597        9597 :     const bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
    3598             : #endif
    3599     3723494 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    3600             :     {
    3601     3713892 :         const double dfSrcPixel =
    3602     3713892 :             (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
    3603     3713892 :         const int nSrcPixelStart = std::max(
    3604     3713892 :             static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5)),
    3605     3713892 :             nChunkXOff);
    3606     3713892 :         const int nSrcPixelStop =
    3607     3713892 :             std::min(static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5),
    3608     3713892 :                      nChunkRightXOff);
    3609             : #if 0
    3610             :         if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
    3611             :         {
    3612             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3613             :         }
    3614             :         if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
    3615             :         {
    3616             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3617             :         }
    3618             : #endif
    3619     3713892 :         const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
    3620     3713892 :         double dfWeightSum = 0.0;
    3621             : 
    3622             :         // Compute convolution coefficients.
    3623     3713892 :         int nSrcPixel = nSrcPixelStart;
    3624     3713892 :         double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
    3625     5823636 :         for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
    3626             :         {
    3627     2109749 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
    3628     2109749 :             dfX += dfXScaleWeight;
    3629     2109749 :             padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
    3630     2109749 :             dfX += dfXScaleWeight;
    3631     2109749 :             padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
    3632     2109749 :             dfX += dfXScaleWeight;
    3633     2109749 :             padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
    3634     2109749 :             dfX += dfXScaleWeight;
    3635     2109749 :             dfWeightSum +=
    3636     2109749 :                 pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
    3637             :         }
    3638     7719022 :         for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
    3639             :         {
    3640     4005130 :             const double dfWeight = pfnFilterFunc(dfX);
    3641     4005130 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
    3642     4005130 :             dfWeightSum += dfWeight;
    3643             :         }
    3644             : 
    3645     3713892 :         const int nHeight = nChunkYSize * nBands;
    3646     3713892 :         if (pabyChunkNodataMask == nullptr)
    3647             :         {
    3648             :             // For floating-point data types, we must scale down a bit values
    3649             :             // if input values are close to +/- std::numeric_limits<T>::max()
    3650             : #ifdef OLD_CPPCHECK
    3651             :             constexpr double mulFactor = 1;
    3652             : #else
    3653     3191883 :             constexpr double mulFactor =
    3654             :                 (bNeedRescale &&
    3655             :                  (std::is_same_v<T, float> || std::is_same_v<T, double>))
    3656             :                     ? 2
    3657             :                     : 1;
    3658             : #endif
    3659             : 
    3660     3191883 :             if (dfWeightSum != 0)
    3661             :             {
    3662     3191883 :                 const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
    3663    13086524 :                 for (int i = 0; i < nSrcPixelCount; ++i)
    3664             :                 {
    3665     9894651 :                     padfWeights[i] *= dfInvWeightSum;
    3666             :                 }
    3667             :             }
    3668             : 
    3669   182388430 :             const auto ScaleValue = [
    3670             : #ifdef _MSC_VER
    3671             :                                         mulFactor
    3672             : #endif
    3673             :             ](double dfVal, [[maybe_unused]] const T *inputValues,
    3674             :                                     [[maybe_unused]] int nInputValues)
    3675             :             {
    3676   182388000 :                 constexpr bool isFloat =
    3677             :                     std::is_same_v<T, float> || std::is_same_v<T, double>;
    3678             :                 if constexpr (isFloat)
    3679             :                 {
    3680     4070140 :                     if (std::isfinite(dfVal))
    3681             :                     {
    3682             :                         return std::clamp(dfVal,
    3683    12204800 :                                           -std::numeric_limits<double>::max() /
    3684             :                                               mulFactor,
    3685     4068260 :                                           std::numeric_limits<double>::max() /
    3686     4068260 :                                               mulFactor) *
    3687     4068260 :                                mulFactor;
    3688             :                     }
    3689             :                     else if constexpr (bKernelWithNegativeWeights)
    3690             :                     {
    3691         936 :                         if (std::isnan(dfVal))
    3692             :                         {
    3693             :                             // Either one of the input value is NaN or they are +/-Inf
    3694         936 :                             const bool isPositive = inputValues[0] >= 0;
    3695        6008 :                             for (int i = 0; i < nInputValues; ++i)
    3696             :                             {
    3697        5384 :                                 if (std::isnan(inputValues[i]))
    3698         312 :                                     return dfVal;
    3699             :                                 // cppcheck-suppress knownConditionTrueFalse
    3700        5072 :                                 if ((inputValues[i] >= 0) != isPositive)
    3701           0 :                                     return dfVal;
    3702             :                             }
    3703             :                             // All values are positive or negative infinity
    3704         624 :                             return static_cast<double>(inputValues[0]);
    3705             :                         }
    3706             :                     }
    3707             :                 }
    3708   178319000 :                 return dfVal;
    3709             :             };
    3710             : 
    3711     3191883 :             int iSrcLineOff = 0;
    3712             : #ifdef USE_SSE2
    3713     3191883 :             if (nSrcPixelCount == 4)
    3714             :             {
    3715    17007029 :                 for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
    3716             :                 {
    3717    16161558 :                     const size_t j =
    3718    16161558 :                         static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3719    16161558 :                         (nSrcPixelStart - nChunkXOff);
    3720    16161558 :                     double dfVal1 = 0.0;
    3721    16161558 :                     double dfVal2 = 0.0;
    3722    16161558 :                     double dfVal3 = 0.0;
    3723             :                     if constexpr (std::is_floating_point_v<T>)
    3724             :                     {
    3725     1256690 :                         if (bHasNaN)
    3726             :                         {
    3727             :                             GDALResampleConvolutionHorizontalPixelCount4_3rows<
    3728           0 :                                 T, true>(pChunk + j, pChunk + j + nChunkXSize,
    3729           0 :                                          pChunk + j + 2 * nChunkXSize,
    3730             :                                          padfWeights, dfVal1, dfVal2, dfVal3);
    3731             :                         }
    3732             :                         else
    3733             :                         {
    3734             :                             GDALResampleConvolutionHorizontalPixelCount4_3rows<
    3735     1256690 :                                 T, false>(pChunk + j, pChunk + j + nChunkXSize,
    3736     1256690 :                                           pChunk + j + 2 * nChunkXSize,
    3737             :                                           padfWeights, dfVal1, dfVal2, dfVal3);
    3738             :                         }
    3739             :                     }
    3740             :                     else
    3741             :                     {
    3742             :                         GDALResampleConvolutionHorizontalPixelCount4_3rows<
    3743    14904868 :                             T, false>(pChunk + j, pChunk + j + nChunkXSize,
    3744    14904868 :                                       pChunk + j + 2 * nChunkXSize, padfWeights,
    3745             :                                       dfVal1, dfVal2, dfVal3);
    3746             :                     }
    3747    32323080 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3748    16161558 :                                                nDstXSize +
    3749    16161558 :                                            iDstPixel - nDstXOff] =
    3750    16161558 :                         ScaleValue(dfVal1, pChunk + j, 4);
    3751    32323080 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3752    16161558 :                                             1) *
    3753    16161558 :                                                nDstXSize +
    3754    16161558 :                                            iDstPixel - nDstXOff] =
    3755    16161558 :                         ScaleValue(dfVal2, pChunk + j + nChunkXSize, 4);
    3756    16161967 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3757    16161558 :                                             2) *
    3758    16161558 :                                                nDstXSize +
    3759    16161558 :                                            iDstPixel - nDstXOff] =
    3760    16161558 :                         ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize, 4);
    3761             :                 }
    3762             :             }
    3763     2346404 :             else if (bSrcPixelCountLess8)
    3764             :             {
    3765     9938308 :                 for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
    3766             :                 {
    3767     7868098 :                     const size_t j =
    3768     7868098 :                         static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3769     7868098 :                         (nSrcPixelStart - nChunkXOff);
    3770     7868098 :                     double dfVal1 = 0.0;
    3771     7868098 :                     double dfVal2 = 0.0;
    3772     7868098 :                     double dfVal3 = 0.0;
    3773             :                     if constexpr (std::is_floating_point_v<T>)
    3774             :                     {
    3775       18980 :                         if (bHasNaN)
    3776             :                         {
    3777             :                             GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
    3778           0 :                                 T, true>(pChunk + j, pChunk + j + nChunkXSize,
    3779           0 :                                          pChunk + j + 2 * nChunkXSize,
    3780             :                                          padfWeights, nSrcPixelCount, dfVal1,
    3781             :                                          dfVal2, dfVal3);
    3782             :                         }
    3783             :                         else
    3784             :                         {
    3785             :                             GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
    3786       18980 :                                 T, false>(pChunk + j, pChunk + j + nChunkXSize,
    3787       18980 :                                           pChunk + j + 2 * nChunkXSize,
    3788             :                                           padfWeights, nSrcPixelCount, dfVal1,
    3789             :                                           dfVal2, dfVal3);
    3790             :                         }
    3791             :                     }
    3792             :                     else
    3793             :                     {
    3794             :                         GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
    3795     7849118 :                             T, false>(pChunk + j, pChunk + j + nChunkXSize,
    3796     7849118 :                                       pChunk + j + 2 * nChunkXSize, padfWeights,
    3797             :                                       nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3798             :                     }
    3799    15736156 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3800     7868098 :                                                nDstXSize +
    3801     7868098 :                                            iDstPixel - nDstXOff] =
    3802     7868098 :                         ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
    3803    15736156 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3804     7868098 :                                             1) *
    3805     7868098 :                                                nDstXSize +
    3806     7868098 :                                            iDstPixel - nDstXOff] =
    3807     7868098 :                         ScaleValue(dfVal2, pChunk + j + nChunkXSize,
    3808             :                                    nSrcPixelCount);
    3809     7868186 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3810     7868098 :                                             2) *
    3811     7868098 :                                                nDstXSize +
    3812     7868098 :                                            iDstPixel - nDstXOff] =
    3813     7868098 :                         ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
    3814             :                                    nSrcPixelCount);
    3815             :                 }
    3816             :             }
    3817             :             else
    3818             : #endif
    3819             :             {
    3820    35902058 :                 for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
    3821             :                 {
    3822    35625944 :                     const size_t j =
    3823    35625944 :                         static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3824    35625944 :                         (nSrcPixelStart - nChunkXOff);
    3825    35625944 :                     double dfVal1 = 0.0;
    3826    35625944 :                     double dfVal2 = 0.0;
    3827    35625944 :                     double dfVal3 = 0.0;
    3828             :                     if constexpr (std::is_floating_point_v<T>)
    3829             :                     {
    3830       65696 :                         if (bHasNaN)
    3831             :                         {
    3832           0 :                             GDALResampleConvolutionHorizontal_3rows<T, true>(
    3833           0 :                                 pChunk + j, pChunk + j + nChunkXSize,
    3834           0 :                                 pChunk + j + 2 * nChunkXSize, padfWeights,
    3835             :                                 nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3836             :                         }
    3837             :                         else
    3838             :                         {
    3839       65696 :                             GDALResampleConvolutionHorizontal_3rows<T, false>(
    3840       65696 :                                 pChunk + j, pChunk + j + nChunkXSize,
    3841       65696 :                                 pChunk + j + 2 * nChunkXSize, padfWeights,
    3842             :                                 nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3843             :                         }
    3844             :                     }
    3845             :                     else
    3846             :                     {
    3847    35560248 :                         GDALResampleConvolutionHorizontal_3rows<T, false>(
    3848    35560248 :                             pChunk + j, pChunk + j + nChunkXSize,
    3849    35560248 :                             pChunk + j + 2 * nChunkXSize, padfWeights,
    3850             :                             nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3851             :                     }
    3852    71251798 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3853    35625944 :                                                nDstXSize +
    3854    35625944 :                                            iDstPixel - nDstXOff] =
    3855    35625944 :                         ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
    3856    71251798 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3857    35625944 :                                             1) *
    3858    35625944 :                                                nDstXSize +
    3859    35625944 :                                            iDstPixel - nDstXOff] =
    3860    35625944 :                         ScaleValue(dfVal2, pChunk + j + nChunkXSize,
    3861             :                                    nSrcPixelCount);
    3862    35691048 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3863    35625944 :                                             2) *
    3864    35625944 :                                                nDstXSize +
    3865    35625944 :                                            iDstPixel - nDstXOff] =
    3866    35625944 :                         ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
    3867             :                                    nSrcPixelCount);
    3868             :                 }
    3869             :             }
    3870     6613620 :             for (; iSrcLineOff < nHeight; ++iSrcLineOff)
    3871             :             {
    3872     3421743 :                 const size_t j =
    3873     3421743 :                     static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3874     3421743 :                     (nSrcPixelStart - nChunkXOff);
    3875     3970903 :                 const double dfVal = GDALResampleConvolutionHorizontal(
    3876      595200 :                     pChunk + j, padfWeights, nSrcPixelCount);
    3877     3422192 :                 padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3878     3421743 :                                            nDstXSize +
    3879     3421743 :                                        iDstPixel - nDstXOff] =
    3880     3421743 :                     ScaleValue(dfVal, pChunk + j, nSrcPixelCount);
    3881             :             }
    3882             :         }
    3883             :         else
    3884             :         {
    3885    32759623 :             for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
    3886             :             {
    3887    32237528 :                 const size_t j =
    3888    32237528 :                     static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3889    32237528 :                     (nSrcPixelStart - nChunkXOff);
    3890             : 
    3891             :                 if (bKernelWithNegativeWeights)
    3892             :                 {
    3893    27492508 :                     int nConsecutiveValid = 0;
    3894    27492508 :                     int nMaxConsecutiveValid = 0;
    3895   747674146 :                     for (int k = 0; k < nSrcPixelCount; k++)
    3896             :                     {
    3897   720181938 :                         if (pabyChunkNodataMask[j + k])
    3898    43694301 :                             nConsecutiveValid++;
    3899   676487837 :                         else if (nConsecutiveValid)
    3900             :                         {
    3901      107658 :                             nMaxConsecutiveValid = std::max(
    3902      107658 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    3903      107658 :                             nConsecutiveValid = 0;
    3904             :                         }
    3905             :                     }
    3906    27492508 :                     nMaxConsecutiveValid =
    3907    27492508 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    3908    27492508 :                     if (nMaxConsecutiveValid < nSrcPixelCount / 2)
    3909             :                     {
    3910    21564707 :                         const size_t nTempOffset =
    3911    21564707 :                             static_cast<size_t>(iSrcLineOff) * nDstXSize +
    3912    21564707 :                             iDstPixel - nDstXOff;
    3913    21564707 :                         padfHorizontalFiltered[nTempOffset] = 0.0;
    3914    21564707 :                         pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3915    21564707 :                         continue;
    3916             :                     }
    3917             :                 }
    3918             : 
    3919    10672871 :                 double dfSumWeightedVal = 0.0;
    3920    10672871 :                 double dfSumWeightedAlpha = 0.0;
    3921             :                 if constexpr (std::is_floating_point_v<T>)
    3922             :                 {
    3923       46368 :                     if (bHasNaN)
    3924             :                     {
    3925        1792 :                         GDALResampleConvolutionHorizontalWithMask<T, true>(
    3926        1792 :                             pChunk + j, pabyChunkNodataMask + j, padfWeights,
    3927             :                             nSrcPixelCount, dfSumWeightedVal,
    3928             :                             dfSumWeightedAlpha, dfWeightSum);
    3929             :                     }
    3930             :                     else
    3931             :                     {
    3932       44576 :                         GDALResampleConvolutionHorizontalWithMask<T, false>(
    3933       44576 :                             pChunk + j, pabyChunkNodataMask + j, padfWeights,
    3934             :                             nSrcPixelCount, dfSumWeightedVal,
    3935             :                             dfSumWeightedAlpha, dfWeightSum);
    3936             :                     }
    3937             :                 }
    3938             :                 else
    3939             :                 {
    3940    10626503 :                     GDALResampleConvolutionHorizontalWithMask<T, false>(
    3941          63 :                         pChunk + j, pabyChunkNodataMask + j, padfWeights,
    3942             :                         nSrcPixelCount, dfSumWeightedVal, dfSumWeightedAlpha,
    3943             :                         dfWeightSum);
    3944             :                 }
    3945    10672871 :                 const size_t nTempOffset =
    3946    10672871 :                     static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
    3947    10672871 :                     nDstXOff;
    3948    10672871 :                 if (dfSumWeightedAlpha > 0.0)
    3949             :                 {
    3950     8760088 :                     padfHorizontalFiltered[nTempOffset] =
    3951     8760088 :                         dfSumWeightedVal / dfSumWeightedAlpha;
    3952             :                     // Not entirely clear if clamping values in the horizontal filter
    3953             :                     // is the right thing to do, but otherwise, for
    3954             :                     // https://github.com/OSGeo/gdal/issues/14728
    3955             :                     // with very small values of alpha, we get very strong under
    3956             :                     // and over shoots.
    3957             :                     if constexpr (std::is_same_v<T, uint8_t>)
    3958             :                     {
    3959     8713690 :                         padfHorizontalFiltered[nTempOffset] = std::clamp(
    3960     8713690 :                             padfHorizontalFiltered[nTempOffset], 0.0, 255.0);
    3961             :                     }
    3962             :                     else if constexpr (std::is_same_v<T, uint16_t>)
    3963             :                     {
    3964          60 :                         padfHorizontalFiltered[nTempOffset] = std::clamp(
    3965          60 :                             padfHorizontalFiltered[nTempOffset], 0.0, 65535.0);
    3966             :                     }
    3967     8760088 :                     const double dfAlpha = dfSumWeightedAlpha / dfWeightSum;
    3968     8760088 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] =
    3969     8760088 :                         static_cast<uint8_t>(std::min(dfAlpha + 0.5, 255.0));
    3970             :                 }
    3971             :                 else
    3972             :                 {
    3973     1912797 :                     padfHorizontalFiltered[nTempOffset] = 0.0;
    3974     1912797 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3975             :                 }
    3976             :             }
    3977             :         }
    3978             :     }
    3979             : 
    3980             :     /* ==================================================================== */
    3981             :     /*      Second pass: vertical filter                                    */
    3982             :     /* ==================================================================== */
    3983        9597 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    3984             : 
    3985      414141 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    3986             :     {
    3987      404544 :         Twork *const pafDstScanline =
    3988             :             pafWrkScanline
    3989      404544 :                 ? pafWrkScanline
    3990       14028 :                 : static_cast<Twork *>(pDstBuffer) +
    3991       14028 :                       static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
    3992             : 
    3993      404544 :         const double dfSrcLine =
    3994      404544 :             (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
    3995      404544 :         const int nSrcLineStart =
    3996      404544 :             std::max(static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5)),
    3997      404544 :                      nChunkYOff);
    3998      404544 :         const int nSrcLineStop =
    3999      404544 :             std::min(static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5),
    4000      404544 :                      nChunkBottomYOff);
    4001             : #if 0
    4002             :         if( nSrcLineStart < nChunkYOff &&
    4003             :             nChunkYOff > 0 )
    4004             :         {
    4005             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    4006             :         }
    4007             :         if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
    4008             :         {
    4009             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    4010             :         }
    4011             : #endif
    4012      404544 :         const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
    4013      404544 :         double dfWeightSum = 0.0;
    4014             : 
    4015             :         // Compute convolution coefficients.
    4016      404544 :         int nSrcLine = nSrcLineStart;  // Used after for.
    4017      404544 :         double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
    4018     1076797 :         for (; nSrcLine < nSrcLineStop - 3;
    4019      672253 :              nSrcLine += 4, dfY += 4 * dfYScaleWeight)
    4020             :         {
    4021      672253 :             padfWeights[nSrcLine - nSrcLineStart] = dfY;
    4022      672253 :             padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
    4023      672253 :             padfWeights[nSrcLine + 2 - nSrcLineStart] =
    4024      672253 :                 dfY + 2 * dfYScaleWeight;
    4025      672253 :             padfWeights[nSrcLine + 3 - nSrcLineStart] =
    4026      672253 :                 dfY + 3 * dfYScaleWeight;
    4027      672253 :             dfWeightSum +=
    4028      672253 :                 pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
    4029             :         }
    4030      443434 :         for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
    4031             :         {
    4032       38890 :             const double dfWeight = pfnFilterFunc(dfY);
    4033       38890 :             padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
    4034       38890 :             dfWeightSum += dfWeight;
    4035             :         }
    4036             : 
    4037      404544 :         if (pabyChunkNodataMask == nullptr)
    4038             :         {
    4039             :             // For floating-point data types, we must scale down a bit values
    4040             :             // if input values are close to +/- std::numeric_limits<T>::max()
    4041             : #ifdef OLD_CPPCHECK
    4042             :             constexpr double mulFactor = 1;
    4043             : #else
    4044      360192 :             constexpr double mulFactor =
    4045             :                 (bNeedRescale &&
    4046             :                  (std::is_same_v<T, float> || std::is_same_v<T, double>))
    4047             :                     ? 2
    4048             :                     : 1;
    4049             : #endif
    4050             : 
    4051      360192 :             if (dfWeightSum != 0)
    4052             :             {
    4053      360192 :                 const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
    4054     2617653 :                 for (int i = 0; i < nSrcLineCount; ++i)
    4055     2257467 :                     padfWeights[i] *= dfInvWeightSum;
    4056             :             }
    4057             : 
    4058      360192 :             int iFilteredPixelOff = 0;  // Used after for.
    4059             :             // j used after for.
    4060      360192 :             size_t j =
    4061      360192 :                 (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
    4062             : #ifdef USE_SSE2
    4063             :             if constexpr ((!bNeedRescale || !std::is_same_v<T, float>) &&
    4064             :                           eWrkDataType == GDT_Float32)
    4065             :             {
    4066             : #ifdef __AVX__
    4067             :                 for (; iFilteredPixelOff < nDstXSize - 15;
    4068             :                      iFilteredPixelOff += 16, j += 16)
    4069             :                 {
    4070             :                     GDALResampleConvolutionVertical_16cols(
    4071             :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    4072             :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    4073             :                     if (bHasNoData)
    4074             :                     {
    4075             :                         for (int k = 0; k < 16; k++)
    4076             :                         {
    4077             :                             pafDstScanline[iFilteredPixelOff + k] =
    4078             :                                 replaceValIfNodata(
    4079             :                                     pafDstScanline[iFilteredPixelOff + k]);
    4080             :                         }
    4081             :                     }
    4082             :                 }
    4083             : #else
    4084    26155459 :                 for (; iFilteredPixelOff < nDstXSize - 7;
    4085             :                      iFilteredPixelOff += 8, j += 8)
    4086             :                 {
    4087    25804048 :                     GDALResampleConvolutionVertical_8cols(
    4088    25804048 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    4089    25804048 :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    4090    25804048 :                     if (bHasNoData)
    4091             :                     {
    4092      123192 :                         for (int k = 0; k < 8; k++)
    4093             :                         {
    4094      109504 :                             pafDstScanline[iFilteredPixelOff + k] =
    4095      109504 :                                 replaceValIfNodata(
    4096      109504 :                                     pafDstScanline[iFilteredPixelOff + k]);
    4097             :                         }
    4098             :                     }
    4099             :                 }
    4100             : #endif
    4101             : 
    4102      822491 :                 for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
    4103             :                 {
    4104      471118 :                     const Twork fVal =
    4105      471118 :                         static_cast<Twork>(GDALResampleConvolutionVertical(
    4106      471118 :                             padfHorizontalFiltered + j, nDstXSize, padfWeights,
    4107             :                             nSrcLineCount));
    4108      471118 :                     pafDstScanline[iFilteredPixelOff] =
    4109      471118 :                         replaceValIfNodata(fVal);
    4110             :                 }
    4111             :             }
    4112             :             else
    4113             : #endif
    4114             :             {
    4115     5862642 :                 const auto ScaleValue = [
    4116             : #ifdef _MSC_VER
    4117             :                                             mulFactor
    4118             : #endif
    4119             :                 ](double dfVal, [[maybe_unused]] const double *inputValues,
    4120             :                                         [[maybe_unused]] int nStride,
    4121             :                                         [[maybe_unused]] int nInputValues)
    4122             :                 {
    4123     5862640 :                     constexpr bool isFloat =
    4124             :                         std::is_same_v<T, float> || std::is_same_v<T, double>;
    4125             :                     if constexpr (isFloat)
    4126             :                     {
    4127     5862640 :                         if (std::isfinite(dfVal))
    4128             :                         {
    4129             :                             return std::clamp(
    4130             :                                        dfVal,
    4131             :                                        static_cast<double>(
    4132    17585400 :                                            -std::numeric_limits<Twork>::max()) /
    4133             :                                            mulFactor,
    4134             :                                        static_cast<double>(
    4135     5861800 :                                            std::numeric_limits<Twork>::max()) /
    4136     5861800 :                                            mulFactor) *
    4137     5861800 :                                    mulFactor;
    4138             :                         }
    4139             :                         else if constexpr (bKernelWithNegativeWeights)
    4140             :                         {
    4141         480 :                             if (std::isnan(dfVal))
    4142             :                             {
    4143             :                                 // Either one of the input value is NaN or they are +/-Inf
    4144         480 :                                 const bool isPositive = inputValues[0] >= 0;
    4145        2520 :                                 for (int i = 0; i < nInputValues; ++i)
    4146             :                                 {
    4147        2200 :                                     if (std::isnan(inputValues[i * nStride]))
    4148         160 :                                         return dfVal;
    4149             :                                     // cppcheck-suppress knownConditionTrueFalse
    4150        2040 :                                     if ((inputValues[i] >= 0) != isPositive)
    4151           0 :                                         return dfVal;
    4152             :                                 }
    4153             :                                 // All values are positive or negative infinity
    4154         320 :                                 return inputValues[0];
    4155             :                             }
    4156             :                         }
    4157             :                     }
    4158             : 
    4159         360 :                     return dfVal;
    4160             :                 };
    4161             : 
    4162     2939422 :                 for (; iFilteredPixelOff < nDstXSize - 1;
    4163             :                      iFilteredPixelOff += 2, j += 2)
    4164             :                 {
    4165     2930610 :                     double dfVal1 = 0.0;
    4166     2930610 :                     double dfVal2 = 0.0;
    4167     2930610 :                     GDALResampleConvolutionVertical_2cols(
    4168     2930610 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    4169             :                         nSrcLineCount, dfVal1, dfVal2);
    4170     5861220 :                     pafDstScanline[iFilteredPixelOff] =
    4171     2930610 :                         replaceValIfNodata(static_cast<Twork>(
    4172     2930610 :                             ScaleValue(dfVal1, padfHorizontalFiltered + j,
    4173             :                                        nDstXSize, nSrcLineCount)));
    4174     2930610 :                     pafDstScanline[iFilteredPixelOff + 1] =
    4175     2930610 :                         replaceValIfNodata(static_cast<Twork>(
    4176     2930610 :                             ScaleValue(dfVal2, padfHorizontalFiltered + j + 1,
    4177             :                                        nDstXSize, nSrcLineCount)));
    4178             :                 }
    4179        8819 :                 if (iFilteredPixelOff < nDstXSize)
    4180             :                 {
    4181        1427 :                     const double dfVal = GDALResampleConvolutionVertical(
    4182        1427 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    4183             :                         nSrcLineCount);
    4184        1427 :                     pafDstScanline[iFilteredPixelOff] =
    4185        1427 :                         replaceValIfNodata(static_cast<Twork>(
    4186        1427 :                             ScaleValue(dfVal, padfHorizontalFiltered + j,
    4187             :                                        nDstXSize, nSrcLineCount)));
    4188             :                 }
    4189             :             }
    4190             :         }
    4191             :         else
    4192             :         {
    4193    19948965 :             for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
    4194             :                  ++iFilteredPixelOff)
    4195             :             {
    4196    19904685 :                 double dfVal = 0.0;
    4197    19904685 :                 dfWeightSum = 0.0;
    4198    19904685 :                 size_t j = (nSrcLineStart - nChunkYOff) *
    4199    19904685 :                                static_cast<size_t>(nDstXSize) +
    4200    19904685 :                            iFilteredPixelOff;
    4201             :                 if (bKernelWithNegativeWeights)
    4202             :                 {
    4203    18637437 :                     int nConsecutiveValid = 0;
    4204    18637437 :                     int nMaxConsecutiveValid = 0;
    4205   162845921 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    4206             :                     {
    4207   144208284 :                         const double dfWeight =
    4208   144208284 :                             padfWeights[i] *
    4209             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    4210   144208284 :                         if (pabyChunkNodataMaskHorizontalFiltered[j])
    4211             :                         {
    4212    45969501 :                             nConsecutiveValid++;
    4213             :                         }
    4214    98238683 :                         else if (nConsecutiveValid)
    4215             :                         {
    4216      211128 :                             nMaxConsecutiveValid = std::max(
    4217      211128 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    4218      211128 :                             nConsecutiveValid = 0;
    4219             :                         }
    4220   144208284 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    4221   144208284 :                         dfWeightSum += dfWeight;
    4222             :                     }
    4223    18637437 :                     nMaxConsecutiveValid =
    4224    18637437 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    4225    18637437 :                     if (nMaxConsecutiveValid < nSrcLineCount / 2)
    4226             :                     {
    4227     9501801 :                         pafDstScanline[iFilteredPixelOff] =
    4228     9501709 :                             static_cast<Twork>(dfNoDataValue);
    4229     9501801 :                         continue;
    4230             :                     }
    4231             :                 }
    4232             :                 else
    4233             :                 {
    4234     6353336 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    4235             :                     {
    4236     5086078 :                         const double dfWeight =
    4237     5086078 :                             padfWeights[i] *
    4238             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    4239     5086078 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    4240     5086078 :                         dfWeightSum += dfWeight;
    4241             :                     }
    4242             :                 }
    4243    10402854 :                 if (dfWeightSum > 0.0)
    4244             :                 {
    4245     9856520 :                     pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
    4246     9856172 :                         static_cast<Twork>(dfVal / dfWeightSum));
    4247             :                 }
    4248             :                 else
    4249             :                 {
    4250      546347 :                     pafDstScanline[iFilteredPixelOff] =
    4251      546323 :                         static_cast<Twork>(dfNoDataValue);
    4252             :                 }
    4253             :             }
    4254             :         }
    4255             : 
    4256      404544 :         if (fMaxVal != 0.0f)
    4257             :         {
    4258             :             if constexpr (std::is_same_v<T, double>)
    4259             :             {
    4260           0 :                 for (int i = 0; i < nDstXSize; ++i)
    4261             :                 {
    4262           0 :                     if (pafDstScanline[i] > static_cast<double>(fMaxVal))
    4263           0 :                         pafDstScanline[i] = static_cast<double>(fMaxVal);
    4264             :                 }
    4265             :             }
    4266             :             else
    4267             :             {
    4268      192324 :                 for (int i = 0; i < nDstXSize; ++i)
    4269             :                 {
    4270      192088 :                     if (pafDstScanline[i] > fMaxVal)
    4271       96022 :                         pafDstScanline[i] = fMaxVal;
    4272             :                 }
    4273             :             }
    4274             :         }
    4275             : 
    4276      404544 :         if (pafWrkScanline)
    4277             :         {
    4278      390516 :             GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
    4279             :                             static_cast<GByte *>(pDstBuffer) +
    4280      390516 :                                 static_cast<size_t>(iDstLine - nDstYOff) *
    4281      390516 :                                     nDstXSize * nDstDataTypeSize,
    4282             :                             dstDataType, nDstDataTypeSize, nDstXSize);
    4283             :         }
    4284             :     }
    4285             : 
    4286        9597 :     VSIFree(pafWrkScanline);
    4287        9597 :     VSIFreeAligned(padfWeights);
    4288        9597 :     VSIFree(padfHorizontalFiltered);
    4289        9597 :     VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    4290             : 
    4291        9597 :     return CE_None;
    4292             : }
    4293             : 
    //                 Runtime-to-template dispatcher for convolution resampling of one chunk.
    //
    //                 Steps, in order:
    //                  1. Translates args.pszResampling (BILINEAR, CUBIC, CUBICSPLINE or
    //                     LANCZOS, compared with EQUAL) into a GDALResampleAlg. Any other
    //                     name hits CPLAssert(false) and returns CE_Failure.
    //                  2. Looks up the kernel radius and the scalar / 4-values filter
    //                     functions through the GWKGetFilter*() helpers.
    //                  3. Derives fMaxVal, the optional NBITS clamp (left at 0.f when the
    //                     conditions below are not met).
    //                  4. Allocates *ppDstBuffer for
    //                     (nDstXOff2 - nDstXOff) x (nDstYOff2 - nDstYOff) pixels of
    //                     args.eOvrDataType and stores that type in *peDstBufferDataType.
    //                  5. Forwards to GDALResampleChunk_ConvolutionT<> instantiated for
    //                     args.eWrkDataType (UInt8, UInt16, Float32 or Float64).
    //
    //                 Returns the callee's CPLErr, or CE_Failure when the allocation fails
    //                 or the resampling name / working data type is not handled.
    //
    //                 NOTE(review): when the switch reaches its default case, the buffer
    //                 already stored in *ppDstBuffer is not released here; presumably the
    //                 caller frees it - confirm.
    4294             : template <bool bKernelWithNegativeWeights, bool bNeedRescale>
    4295             : static CPLErr
    4296        9597 : GDALResampleChunk_ConvolutionInternal(const GDALOverviewResampleArgs &args,
    4297             :                                       const void *pChunk, void **ppDstBuffer,
    4298             :                                       GDALDataType *peDstBufferDataType)
    4299             : {
    //                     Step 1: resampling name -> GDALResampleAlg.
    4300             :     GDALResampleAlg eResample;
    4301        9597 :     if (EQUAL(args.pszResampling, "BILINEAR"))
    4302        7097 :         eResample = GRA_Bilinear;
    4303        2500 :     else if (EQUAL(args.pszResampling, "CUBIC"))
    4304        2318 :         eResample = GRA_Cubic;
    4305         182 :     else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
    4306          86 :         eResample = GRA_CubicSpline;
    4307          96 :     else if (EQUAL(args.pszResampling, "LANCZOS"))
    4308          96 :         eResample = GRA_Lanczos;
    4309             :     else
    4310             :     {
    //                         Only the four names above are handled by this function.
    4311           0 :         CPLAssert(false);
    4312             :         return CE_Failure;
    4313             :     }
    //                     Step 2: kernel radius and weight functions for that algorithm.
    4314        9597 :     const int nKernelRadius = GWKGetFilterRadius(eResample);
    4315        9597 :     FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
    4316             :     const FilterFunc4ValuesType pfnFilterFunc4Values =
    4317        9597 :         GWKGetFilterFunc4Values(eResample);
    4318             : 
    //                     Step 3: fMaxVal stays 0.f unless an NBITS clamp is required.
    4319        9597 :     float fMaxVal = 0.f;
    4320             :     // Cubic, etc... can have overshoots, so make sure we clamp values to the
    4321             :     // maximum value if NBITS is set.
    4322        9597 :     if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
    4323           8 :         (args.eOvrDataType == GDT_UInt8 || args.eOvrDataType == GDT_UInt16 ||
    4324           0 :          args.eOvrDataType == GDT_UInt32))
    4325             :     {
    4326           8 :         int nBits = args.nOvrNBITS;
    //                         NBITS equal to the full bit width of the overview type is
    //                         reset to 0, so no clamp value is computed for it.
    4327           8 :         if (nBits == GDALGetDataTypeSizeBits(args.eOvrDataType))
    4328           1 :             nBits = 0;
    //                         fMaxVal = 2^nBits - 1; the nBits < 32 test bounds the shift
    //                         amount of 1U << nBits.
    4329           8 :         if (nBits > 0 && nBits < 32)
    4330           7 :             fMaxVal = static_cast<float>((1U << nBits) - 1);
    4331             :     }
    4332             : 
    //                     Step 4: output buffer of width x height x bytes-per-pixel, in
    //                     the overview data type.
    4333        9597 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(
    4334             :         args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
    4335             :         GDALGetDataTypeSizeBytes(args.eOvrDataType));
    4336        9597 :     if (*ppDstBuffer == nullptr)
    4337             :     {
    4338           0 :         return CE_Failure;
    4339             :     }
    4340        9597 :     *peDstBufferDataType = args.eOvrDataType;
    4341             : 
    //                     Step 5: pick the instantiation matching the working data type.
    //                     GByte, GUInt16 and float chunks use float / GDT_Float32 as
    //                     second and third template arguments; double chunks use
    //                     double / GDT_Float64.
    4342        9597 :     switch (args.eWrkDataType)
    4343             :     {
    4344        8705 :         case GDT_UInt8:
    4345             :         {
    4346             :             return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32,
    4347             :                                                   bKernelWithNegativeWeights,
    4348        8705 :                                                   bNeedRescale>(
    4349             :                 args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
    4350        8705 :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
    4351             :         }
    4352             : 
    4353         402 :         case GDT_UInt16:
    4354             :         {
    4355             :             return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32,
    4356             :                                                   bKernelWithNegativeWeights,
    4357         402 :                                                   bNeedRescale>(
    4358             :                 args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
    4359         402 :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
    4360             :         }
    4361             : 
    4362         387 :         case GDT_Float32:
    4363             :         {
    4364             :             return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32,
    4365             :                                                   bKernelWithNegativeWeights,
    4366         387 :                                                   bNeedRescale>(
    4367             :                 args, static_cast<const float *>(pChunk), *ppDstBuffer,
    4368         387 :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
    4369             :         }
    4370             : 
    4371         103 :         case GDT_Float64:
    4372             :         {
    4373             :             return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64,
    4374             :                                                   bKernelWithNegativeWeights,
    4375         103 :                                                   bNeedRescale>(
    4376             :                 args, static_cast<const double *>(pChunk), *ppDstBuffer,
    4377         103 :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
    4378             :         }
    4379             : 
    4380           0 :         default:
    4381           0 :             break;
    4382             :     }
    4383             : 
    //                     Reached only for a working data type without a case above.
    4384           0 :     CPLAssert(false);
    4385             :     return CE_Failure;
    4386             : }
    4387             : 
    //                 Entry point that chooses the compile-time flags of
    //                 GDALResampleChunk_ConvolutionInternal<> from args.pszResampling and
    //                 forwards all arguments unchanged, returning the callee's CPLErr.
    //
    //                   CUBIC, LANCZOS -> <bKernelWithNegativeWeights = true,
    //                                      bNeedRescale = true>
    //                   CUBICSPLINE    -> <false, true>
    //                   anything else  -> <false, false>
    //
    //                 The last branch is the one taken for BILINEAR; a name that is none
    //                 of the four is also routed there and is rejected inside
    //                 GDALResampleChunk_ConvolutionInternal (CPLAssert + CE_Failure).
    4388             : static CPLErr
    4389        9597 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
    4390             :                               const void *pChunk, void **ppDstBuffer,
    4391             :                               GDALDataType *peDstBufferDataType)
    4392             : {
    4393        9597 :     if (EQUAL(args.pszResampling, "CUBIC") ||
    4394        7279 :         EQUAL(args.pszResampling, "LANCZOS"))
    4395             :         return GDALResampleChunk_ConvolutionInternal<
    4396        2414 :             /* bKernelWithNegativeWeights=*/true, /* bNeedRescale = */ true>(
    4397        2414 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    4398        7183 :     else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
    4399          86 :         return GDALResampleChunk_ConvolutionInternal<false, true>(
    4400          86 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    4401             :     else
    4402        7097 :         return GDALResampleChunk_ConvolutionInternal<false, false>(
    4403        7097 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    4404             : }
    4405             : 
    4406             : /************************************************************************/
    4407             : /*                       GDALResampleChunkC32R()                        */
    4408             : /************************************************************************/
    4409             : 
    4410           2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
    4411             :                                     const float *pafChunk, const int nChunkYOff,
    4412             :                                     const int nChunkYSize, const int nDstYOff,
    4413             :                                     const int nDstYOff2, const int nOvrXSize,
    4414             :                                     const int nOvrYSize, void **ppDstBuffer,
    4415             :                                     GDALDataType *peDstBufferDataType,
    4416             :                                     const char *pszResampling)
    4417             : 
    4418             : {
    4419             :     enum Method
    4420             :     {
    4421             :         NEAR,
    4422             :         AVERAGE,
    4423             :         AVERAGE_MAGPHASE,
    4424             :         RMS,
    4425             :     };
    4426             : 
    4427           2 :     Method eMethod = NEAR;
    4428           2 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))
    4429             :     {
    4430           0 :         eMethod = NEAR;
    4431             :     }
    4432           2 :     else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
    4433             :     {
    4434           0 :         eMethod = AVERAGE_MAGPHASE;
    4435             :     }
    4436           2 :     else if (EQUAL(pszResampling, "RMS"))
    4437             :     {
    4438           2 :         eMethod = RMS;
    4439             :     }
    4440           0 :     else if (STARTS_WITH_CI(pszResampling, "AVER"))
    4441             :     {
    4442           0 :         eMethod = AVERAGE;
    4443             :     }
    4444             :     else
    4445             :     {
    4446           0 :         CPLError(
    4447             :             CE_Failure, CPLE_NotSupported,
    4448             :             "Resampling method %s is not supported for complex data types. "
    4449             :             "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
    4450             :             pszResampling);
    4451           0 :         return CE_Failure;
    4452             :     }
    4453             : 
    4454           2 :     const int nOXSize = nOvrXSize;
    4455           2 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
    4456             :                                        GDALGetDataTypeSizeBytes(GDT_CFloat32));
    4457           2 :     if (*ppDstBuffer == nullptr)
    4458             :     {
    4459           0 :         return CE_Failure;
    4460             :     }
    4461           2 :     float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
    4462           2 :     *peDstBufferDataType = GDT_CFloat32;
    4463             : 
    4464           2 :     const int nOYSize = nOvrYSize;
    4465           2 :     const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
    4466           2 :     const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
    4467             : 
    4468             :     /* ==================================================================== */
    4469             :     /*      Loop over destination scanlines.                                */
    4470             :     /* ==================================================================== */
    4471           8 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    4472             :     {
    4473           6 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    4474           6 :         if (nSrcYOff < nChunkYOff)
    4475           0 :             nSrcYOff = nChunkYOff;
    4476             : 
    4477           6 :         int nSrcYOff2 =
    4478           6 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
    4479           6 :         if (nSrcYOff2 == nSrcYOff)
    4480           0 :             nSrcYOff2++;
    4481             : 
    4482           6 :         if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
    4483             :         {
    4484           2 :             if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
    4485           0 :                 nSrcYOff = nSrcHeight - 1;
    4486           2 :             nSrcYOff2 = nSrcHeight;
    4487             :         }
    4488           6 :         if (nSrcYOff2 > nChunkYOff + nChunkYSize)
    4489           0 :             nSrcYOff2 = nChunkYOff + nChunkYSize;
    4490             : 
    4491           6 :         const float *const pafSrcScanline =
    4492           6 :             pafChunk +
    4493           6 :             (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
    4494           6 :         float *const pafDstScanline =
    4495           6 :             pafDstBuffer +
    4496           6 :             static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
    4497             : 
    4498             :         /* --------------------------------------------------------------------
    4499             :          */
    4500             :         /*      Loop over destination pixels */
    4501             :         /* --------------------------------------------------------------------
    4502             :          */
    4503          18 :         for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
    4504             :         {
    4505          12 :             const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
    4506          12 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    4507          12 :             int nSrcXOff2 =
    4508          12 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
    4509          12 :             if (nSrcXOff2 == nSrcXOff)
    4510           0 :                 nSrcXOff2++;
    4511          12 :             if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
    4512             :             {
    4513           6 :                 if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
    4514           0 :                     nSrcXOff = nSrcWidth - 1;
    4515           6 :                 nSrcXOff2 = nSrcWidth;
    4516             :             }
    4517          12 :             const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
    4518             : 
    4519          12 :             if (eMethod == NEAR)
    4520             :             {
    4521           0 :                 pafDstScanline[iDstPixelSZ * 2] =
    4522           0 :                     pafSrcScanline[nSrcXOffSZ * 2];
    4523           0 :                 pafDstScanline[iDstPixelSZ * 2 + 1] =
    4524           0 :                     pafSrcScanline[nSrcXOffSZ * 2 + 1];
    4525             :             }
    4526          12 :             else if (eMethod == AVERAGE_MAGPHASE)
    4527             :             {
    4528           0 :                 double dfTotalR = 0.0;
    4529           0 :                 double dfTotalI = 0.0;
    4530           0 :                 double dfTotalM = 0.0;
    4531           0 :                 size_t nCount = 0;
    4532             : 
    4533           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4534             :                 {
    4535           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4536             :                     {
    4537           0 :                         const double dfR = double(
    4538           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4539           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4540           0 :                                                nSrcWidth * 2]);
    4541           0 :                         const double dfI = double(
    4542           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4543           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4544           0 :                                                nSrcWidth * 2 +
    4545           0 :                                            1]);
    4546           0 :                         dfTotalR += dfR;
    4547           0 :                         dfTotalI += dfI;
    4548           0 :                         dfTotalM += std::hypot(dfR, dfI);
    4549           0 :                         ++nCount;
    4550             :                     }
    4551             :                 }
    4552             : 
    4553           0 :                 CPLAssert(nCount > 0);
    4554           0 :                 if (nCount == 0)
    4555             :                 {
    4556           0 :                     pafDstScanline[iDstPixelSZ * 2] = 0.0;
    4557           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
    4558             :                 }
    4559             :                 else
    4560             :                 {
    4561           0 :                     pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
    4562           0 :                         dfTotalR / static_cast<double>(nCount));
    4563           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
    4564           0 :                         dfTotalI / static_cast<double>(nCount));
    4565             :                     const double dfM =
    4566           0 :                         double(std::hypot(pafDstScanline[iDstPixelSZ * 2],
    4567           0 :                                           pafDstScanline[iDstPixelSZ * 2 + 1]));
    4568           0 :                     const double dfDesiredM =
    4569           0 :                         dfTotalM / static_cast<double>(nCount);
    4570           0 :                     double dfRatio = 1.0;
    4571           0 :                     if (dfM != 0.0)
    4572           0 :                         dfRatio = dfDesiredM / dfM;
    4573             : 
    4574           0 :                     pafDstScanline[iDstPixelSZ * 2] *=
    4575           0 :                         static_cast<float>(dfRatio);
    4576           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] *=
    4577           0 :                         static_cast<float>(dfRatio);
    4578             :                 }
    4579             :             }
    4580          12 :             else if (eMethod == RMS)
    4581             :             {
    4582          12 :                 double dfTotalR = 0.0;
    4583          12 :                 double dfTotalI = 0.0;
    4584          12 :                 size_t nCount = 0;
    4585             : 
    4586          36 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4587             :                 {
    4588          72 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4589             :                     {
    4590          48 :                         const double dfR = double(
    4591          48 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4592          48 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4593          48 :                                                nSrcWidth * 2]);
    4594          48 :                         const double dfI = double(
    4595          48 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4596          48 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4597          48 :                                                nSrcWidth * 2 +
    4598          48 :                                            1]);
    4599             : 
    4600          48 :                         dfTotalR += SQUARE(dfR);
    4601          48 :                         dfTotalI += SQUARE(dfI);
    4602             : 
    4603          48 :                         ++nCount;
    4604             :                     }
    4605             :                 }
    4606             : 
    4607          12 :                 CPLAssert(nCount > 0);
    4608          12 :                 if (nCount == 0)
    4609             :                 {
    4610           0 :                     pafDstScanline[iDstPixelSZ * 2] = 0.0;
    4611           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
    4612             :                 }
    4613             :                 else
    4614             :                 {
    4615             :                     /* compute RMS */
    4616          12 :                     pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
    4617          12 :                         sqrt(dfTotalR / static_cast<double>(nCount)));
    4618          12 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
    4619          12 :                         sqrt(dfTotalI / static_cast<double>(nCount)));
    4620             :                 }
    4621             :             }
    4622           0 :             else if (eMethod == AVERAGE)
    4623             :             {
    4624           0 :                 double dfTotalR = 0.0;
    4625           0 :                 double dfTotalI = 0.0;
    4626           0 :                 size_t nCount = 0;
    4627             : 
    4628           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4629             :                 {
    4630           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4631             :                     {
    4632             :                         // TODO(schwehr): Maybe use std::complex?
    4633           0 :                         dfTotalR += double(
    4634           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4635           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4636           0 :                                                nSrcWidth * 2]);
    4637           0 :                         dfTotalI += double(
    4638           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4639           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4640           0 :                                                nSrcWidth * 2 +
    4641           0 :                                            1]);
    4642           0 :                         ++nCount;
    4643             :                     }
    4644             :                 }
    4645             : 
    4646           0 :                 CPLAssert(nCount > 0);
    4647           0 :                 if (nCount == 0)
    4648             :                 {
    4649           0 :                     pafDstScanline[iDstPixelSZ * 2] = 0.0;
    4650           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
    4651             :                 }
    4652             :                 else
    4653             :                 {
    4654           0 :                     pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
    4655           0 :                         dfTotalR / static_cast<double>(nCount));
    4656           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
    4657           0 :                         dfTotalI / static_cast<double>(nCount));
    4658             :                 }
    4659             :             }
    4660             :         }
    4661             :     }
    4662             : 
    4663           2 :     return CE_None;
    4664             : }
    4665             : 
    4666             : /************************************************************************/
    4667             : /*                  GDALRegenerateCascadingOverviews()                  */
    4668             : /*                                                                      */
    4669             : /*      Generate a list of overviews in order from largest to           */
    4670             : /*      smallest, computing each from the next larger.                  */
    4671             : /************************************************************************/
    4672             : // Reorders papoOvrBands in place (largest first), then regenerates overview i from overview i - 1 (overview 0 from poSrcBand). Returns the first non-CE_None status, otherwise CE_None.
    4673          44 : static CPLErr GDALRegenerateCascadingOverviews(
    4674             :     GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,  // base band; entry count of papoOvrBands; overview bands (array is reordered)
    4675             :     const char *pszResampling, GDALProgressFunc pfnProgress,  // resampling method name; progress callback wrapped per overview below
    4676             :     void *pProgressData, CSLConstList papszOptions)  // callback user data; options copied and extended with CASCADING=YES
    4677             : 
    4678             : {
    4679             :     /* -------------------------------------------------------------------- */
    4680             :     /*      First, we must put the overviews in order from largest to       */
    4681             :     /*      smallest.                                                       */
    4682             :     /* -------------------------------------------------------------------- */
    4683         127 :     for (int i = 0; i < nOverviews - 1; ++i)  // bubble sort: each outer pass settles one more entry at the tail
    4684             :     {
    4685         292 :         for (int j = 0; j < nOverviews - i - 1; ++j)
    4686             :         {
    4687         209 :             if (papoOvrBands[j]->GetXSize() *  // compare pixel counts (width * height) of neighbouring entries
    4688         209 :                     static_cast<float>(papoOvrBands[j]->GetYSize()) <
    4689         209 :                 papoOvrBands[j + 1]->GetXSize() *
    4690         209 :                     static_cast<float>(papoOvrBands[j + 1]->GetYSize()))  // NOTE(review): product is formed in float here but in double in the loops below
    4691             :             {
    4692           0 :                 GDALRasterBand *poTempBand = papoOvrBands[j];  // swap so the larger band comes first
    4693           0 :                 papoOvrBands[j] = papoOvrBands[j + 1];
    4694           0 :                 papoOvrBands[j + 1] = poTempBand;
    4695             :             }
    4696             :         }
    4697             :     }
    4698             : 
    4699             :     /* -------------------------------------------------------------------- */
    4700             :     /*      Count total pixels so we can prepare appropriate scaled         */
    4701             :     /*      progress functions.                                             */
    4702             :     /* -------------------------------------------------------------------- */
    4703          44 :     double dfTotalPixels = 0.0;
    4704             : 
    4705         171 :     for (int i = 0; i < nOverviews; ++i)
    4706             :     {
    4707         127 :         dfTotalPixels += papoOvrBands[i]->GetXSize() *
    4708         127 :                          static_cast<double>(papoOvrBands[i]->GetYSize());
    4709             :     }
    4710             : 
    4711             :     /* -------------------------------------------------------------------- */
    4712             :     /*      Generate all the bands.                                         */
    4713             :     /* -------------------------------------------------------------------- */
    4714          44 :     double dfPixelsProcessed = 0.0;  // pixels of the overviews completed so far
    4715             : 
    4716          88 :     CPLStringList aosOptions(papszOptions);  // local copy so the caller's list is not modified
    4717          44 :     aosOptions.SetNameValue("CASCADING", "YES");  // handed to every GDALRegenerateOverviewsEx() call below
    4718         171 :     for (int i = 0; i < nOverviews; ++i)
    4719             :     {
    4720         127 :         GDALRasterBand *poBaseBand = poSrcBand;  // largest overview is computed from the source band...
    4721         127 :         if (i != 0)
    4722          83 :             poBaseBand = papoOvrBands[i - 1];  // ...every other one from the overview just before it
    4723             : 
    4724         127 :         double dfPixels = papoOvrBands[i]->GetXSize() *
    4725         127 :                           static_cast<double>(papoOvrBands[i]->GetYSize());
    4726             : 
    4727         254 :         void *pScaledProgressData = GDALCreateScaledProgress(  // progress sub-range proportional to this overview's pixel count
    4728             :             dfPixelsProcessed / dfTotalPixels,
    4729         127 :             (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
    4730             :             pProgressData);
    4731             : 
    4732         254 :         const CPLErr eErr = GDALRegenerateOverviewsEx(
    4733             :             poBaseBand, 1,
    4734         127 :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,  // one-element band list starting at entry i
    4735             :             pszResampling, GDALScaledProgress, pScaledProgressData,
    4736         127 :             aosOptions.List());
    4737         127 :         GDALDestroyScaledProgress(pScaledProgressData);  // destroyed before the error check, so also on failure
    4738             : 
    4739         127 :         if (eErr != CE_None)
    4740           0 :             return eErr;  // stop at the first failing overview
    4741             : 
    4742         127 :         dfPixelsProcessed += dfPixels;
    4743             : 
    4744             :         // Only do the bit2grayscale promotion on the base band.
    4745         127 :         if (STARTS_WITH_CI(pszResampling,
    4746             :                            "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
    4747           8 :             pszResampling = "AVERAGE";  // remaining iterations use plain AVERAGE
    4748             :     }
    4749             : 
    4750          44 :     return CE_None;
    4751             : }
    4752             : 
    4753             : /************************************************************************/
    4754             : /*                      GDALGetResampleFunction()                       */
    4755             : /************************************************************************/
    4756             : // Maps a resampling method name to its chunk resampling function. When pnRadius is non-null it receives the kernel radius (0 unless a branch below sets it). Unknown names emit a CPLError and return nullptr.
    4757       19281 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
    4758             :                                              int *pnRadius)
    4759             : {
    4760       19281 :     if (pnRadius)  // pnRadius is optional
    4761       19281 :         *pnRadius = 0;  // value reported for methods that set no radius below
    4762       19281 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))  // prefix test, so longer names beginning with NEAR also match
    4763         545 :         return GDALResampleChunk_Near;
    4764       18736 :     else if (STARTS_WITH_CI(pszResampling, "AVER") ||  // prefix test: covers AVERAGE and names such as AVERAGE_BIT2GRAYSCALE
    4765        7508 :              EQUAL(pszResampling, "RMS"))
    4766       11293 :         return GDALResampleChunk_AverageOrRMS;
    4767        7443 :     else if (EQUAL(pszResampling, "GAUSS"))
    4768             :     {
    4769          26 :         if (pnRadius)
    4770          26 :             *pnRadius = 1;  // fixed radius for GAUSS
    4771          26 :         return GDALResampleChunk_Gauss;
    4772             :     }
    4773        7417 :     else if (EQUAL(pszResampling, "MODE"))
    4774         142 :         return GDALResampleChunk_Mode;
    4775        7275 :     else if (EQUAL(pszResampling, "CUBIC"))  // the four branches from here share GDALResampleChunk_Convolution and differ only in the radius
    4776             :     {
    4777        1648 :         if (pnRadius)
    4778        1648 :             *pnRadius = GWKGetFilterRadius(GRA_Cubic);
    4779        1648 :         return GDALResampleChunk_Convolution;
    4780             :     }
    4781        5627 :     else if (EQUAL(pszResampling, "CUBICSPLINE"))
    4782             :     {
    4783          60 :         if (pnRadius)
    4784          60 :             *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
    4785          60 :         return GDALResampleChunk_Convolution;
    4786             :     }
    4787        5567 :     else if (EQUAL(pszResampling, "LANCZOS"))
    4788             :     {
    4789          50 :         if (pnRadius)
    4790          50 :             *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
    4791          50 :         return GDALResampleChunk_Convolution;
    4792             :     }
    4793        5517 :     else if (EQUAL(pszResampling, "BILINEAR"))
    4794             :     {
    4795        5517 :         if (pnRadius)
    4796        5517 :             *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
    4797        5517 :         return GDALResampleChunk_Convolution;
    4798             :     }
    4799             :     else
    4800             :     {
    4801           0 :         CPLError(  // unrecognized name: report it; *pnRadius (if any) keeps the 0 written above
    4802             :             CE_Failure, CPLE_AppDefined,
    4803             :             "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
    4804             :             pszResampling);
    4805           0 :         return nullptr;
    4806             :     }
    4807             : }
    4808             : 
    4809             : /************************************************************************/
    4810             : /*                       GDALGetOvrWorkDataType()                       */
    4811             : /************************************************************************/
    4812             : // Picks the data type to work in for a resampling method name and a source data type. The rules are tried in the order written below; the first match wins.
    4813       19163 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
    4814             :                                     GDALDataType eSrcDataType)
    4815             : {
    4816       19163 :     if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))  // NEAR* and MODE keep the source type
    4817             :     {
    4818         679 :         return eSrcDataType;
    4819             :     }
    4820       18484 :     else if (eSrcDataType == GDT_UInt8 &&  // UInt8 source with one of the listed methods stays UInt8
    4821       17911 :              (STARTS_WITH_CI(pszResampling, "AVER") ||
    4822        6781 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4823        5375 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4824        5355 :               EQUAL(pszResampling, "LANCZOS") ||
    4825        5348 :               EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))  // NOTE(review): the MODE test cannot be true here, MODE already returned above
    4826             :     {
    4827       17904 :         return GDT_UInt8;
    4828             :     }
    4829         580 :     else if (eSrcDataType == GDT_UInt16 &&  // same method list for a UInt16 source
    4830         131 :              (STARTS_WITH_CI(pszResampling, "AVER") ||
    4831         126 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4832           8 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4833           6 :               EQUAL(pszResampling, "LANCZOS") ||
    4834           3 :               EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))  // NOTE(review): MODE test is unreachable here as well
    4835             :     {
    4836         131 :         return GDT_UInt16;
    4837             :     }
    4838         449 :     else if (EQUAL(pszResampling, "GAUSS"))  // GAUSS is Float64 for every source type, including UInt8/UInt16
    4839          20 :         return GDT_Float64;
    4840             : // Fallback for everything not matched above: Float32 for the source types listed next, Float64 otherwise.
    4841         429 :     if (eSrcDataType == GDT_UInt8 || eSrcDataType == GDT_Int8 ||
    4842         428 :         eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
    4843             :         eSrcDataType == GDT_Float32)
    4844             :     {
    4845         277 :         return GDT_Float32;
    4846             :     }
    4847         152 :     return GDT_Float64;
    4848             : }
    4849             : 
    4850             : namespace
    4851             : {
    4852             : // Structure to hold a pointer to free with CPLFree()
    4853             : struct PointerHolder
    4854             : {
    4855             :     void *ptr = nullptr;  // pointer handed to CPLFree() by the destructor
    4856             : // Takes over a raw pointer: stores it and sets the caller's variable to nullptr.
    4857        4087 :     template <class T> explicit PointerHolder(T *&ptrIn) : ptr(ptrIn)
    4858             :     {
    4859        4087 :         ptrIn = nullptr;
    4860        4087 :     }
    4861             : // Takes over the pointer released from a unique_ptr that uses VSIFreeReleaser.
    4862             :     template <class T>
    4863          32 :     explicit PointerHolder(std::unique_ptr<T, VSIFreeReleaser> ptrIn)
    4864          32 :         : ptr(ptrIn.release())
    4865             :     {
    4866          32 :     }
    4867             : // Passes the held pointer to CPLFree().
    4868        4119 :     ~PointerHolder()
    4869        4119 :     {
    4870        4119 :         CPLFree(ptr);
    4871        4119 :     }
    4872             : // Copying is disabled: a copy would pass the same pointer to CPLFree() a second time.
    4873             :     PointerHolder(const PointerHolder &) = delete;
    4874             :     PointerHolder &operator=(const PointerHolder &) = delete;
    4875             : };
    4876             : }  // namespace
    4877             : 
    4878             : /************************************************************************/
    4879             : /*                      GDALRegenerateOverviews()                       */
    4880             : /************************************************************************/
    4881             : 
    4882             : /**
    4883             :  * \brief Generate downsampled overviews.
    4884             :  *
    4885             :  * This function will generate one or more overview images from a base image
    4886             :  * using the requested downsampling algorithm.  Its primary use is for
    4887             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4888             :  * used to generate downsampled images in one file from another outside the
    4889             :  * overview architecture.
    4890             :  *
    4891             :  * The output bands need to exist in advance.
    4892             :  *
    4893             :  * The full set of resampling algorithms is documented in
    4894             :  * GDALDataset::BuildOverviews().
    4895             :  *
    4896             :  * This function properly honours NODATA_VALUES tuples (special dataset
    4897             :  * metadata) so that only a given RGB triplet (in the case of an RGB image) is
    4898             :  * considered as the nodata value, rather than each value of the triplet
    4899             :  * independently per band.
    4900             :  *
    4901             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4902             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    4903             :  * overview computation.
    4904             :  *
    4905             :  * @param hSrcBand the source (base level) band.
    4906             :  * @param nOverviewCount the number of downsampled bands being generated.
    4907             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4908             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4909             :  * @param pfnProgress progress report function.
    4910             :  * @param pProgressData progress function callback data.
    4911             :  * @return CE_None on success or CE_Failure on failure.
    4912             :  */
    4913         113 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
    4914             :                                GDALRasterBandH *pahOvrBands,
    4915             :                                const char *pszResampling,
    4916             :                                GDALProgressFunc pfnProgress,
    4917             :                                void *pProgressData)
    4918             : 
    4919             : {  // Thin wrapper: forwards every argument unchanged to GDALRegenerateOverviewsEx().
    4920         113 :     return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
    4921             :                                      pszResampling, pfnProgress, pProgressData,
    4922         113 :                                      nullptr);  // no options list
    4923             : }
    4924             : 
    4925             : /************************************************************************/
    4926             : /*                     GDALRegenerateOverviewsEx()                      */
    4927             : /************************************************************************/
    4928             : 
    4929             : constexpr int RADIUS_TO_DIAMETER = 2;  // Factor of 2; presumably turns a kernel radius into a diameter (usage is outside this excerpt, confirm)
    4930             : 
    4931             : /**
    4932             :  * \brief Generate downsampled overviews.
    4933             :  *
    4934             :  * This function will generate one or more overview images from a base image
    4935             :  * using the requested downsampling algorithm.  Its primary use is for
    4936             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4937             :  * used to generate downsampled images in one file from another outside the
    4938             :  * overview architecture.
    4939             :  *
    4940             :  * The output bands need to exist in advance.
    4941             :  *
    4942             :  * The full set of resampling algorithms is documented in
    4943             :  * GDALDataset::BuildOverviews().
    4944             :  *
    4945             :  * This function properly honours NODATA_VALUES tuples (special dataset
    4946             :  * metadata) so that only a given RGB triplet (in the case of an RGB image) is
    4947             :  * considered as the nodata value, rather than each value of the triplet
    4948             :  * independently per band.
    4949             :  *
    4950             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4951             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    4952             :  * overview computation.
    4953             :  *
    4954             :  * @param hSrcBand the source (base level) band.
    4955             :  * @param nOverviewCount the number of downsampled bands being generated.
    4956             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4957             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4958             :  * @param pfnProgress progress report function.
    4959             :  * @param pProgressData progress function callback data.
    4960             :  * @param papszOptions NULL terminated list of options as key=value pairs, or
    4961             :  * NULL
    4962             :  * @return CE_None on success or CE_Failure on failure.
    4963             :  * @since GDAL 3.6
    4964             :  */
    4965         794 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
    4966             :                                  GDALRasterBandH *pahOvrBands,
    4967             :                                  const char *pszResampling,
    4968             :                                  GDALProgressFunc pfnProgress,
    4969             :                                  void *pProgressData, CSLConstList papszOptions)
    4970             : 
    4971             : {
    4972         794 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    4973         794 :     GDALRasterBand **papoOvrBands =
    4974             :         reinterpret_cast<GDALRasterBand **>(pahOvrBands);
    4975             : 
    4976         794 :     if (pfnProgress == nullptr)
    4977         102 :         pfnProgress = GDALDummyProgress;
    4978             : 
    4979         794 :     if (EQUAL(pszResampling, "NONE"))
    4980          51 :         return CE_None;
    4981             : 
    4982         743 :     int nKernelRadius = 0;
    4983             :     GDALResampleFunction pfnResampleFn =
    4984         743 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    4985             : 
    4986         743 :     if (pfnResampleFn == nullptr)
    4987           0 :         return CE_Failure;
    4988             : 
    4989             :     /* -------------------------------------------------------------------- */
    4990             :     /*      Check color tables...                                           */
    4991             :     /* -------------------------------------------------------------------- */
    4992         743 :     GDALColorTable *poColorTable = nullptr;
    4993             : 
    4994         520 :     if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
    4995        1564 :          EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
    4996         312 :         poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    4997             :     {
    4998           9 :         poColorTable = poSrcBand->GetColorTable();
    4999           9 :         if (poColorTable != nullptr)
    5000             :         {
    5001           9 :             if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
    5002             :             {
    5003           0 :                 CPLError(CE_Warning, CPLE_AppDefined,
    5004             :                          "Computing overviews on palette index raster bands "
    5005             :                          "with a palette whose color interpretation is not RGB "
    5006             :                          "will probably lead to unexpected results.");
    5007           0 :                 poColorTable = nullptr;
    5008             :             }
    5009           9 :             else if (poColorTable->IsIdentity())
    5010             :             {
    5011           0 :                 poColorTable = nullptr;
    5012             :             }
    5013             :         }
    5014             :         else
    5015             :         {
    5016           0 :             CPLError(CE_Warning, CPLE_AppDefined,
    5017             :                      "Computing overviews on palette index raster bands "
    5018             :                      "without a palette will probably lead to unexpected "
    5019             :                      "results.");
    5020             :         }
    5021             :     }
    5022             :     // Not ready yet
    5023        2148 :     else if ((EQUAL(pszResampling, "CUBIC") ||
    5024         680 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    5025         680 :               EQUAL(pszResampling, "LANCZOS") ||
    5026        1494 :               EQUAL(pszResampling, "BILINEAR")) &&
    5027          80 :              poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    5028             :     {
    5029           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    5030             :                  "Computing %s overviews on palette index raster bands "
    5031             :                  "will probably lead to unexpected results.",
    5032             :                  pszResampling);
    5033             :     }
    5034             : 
    5035             :     // If we have a nodata mask and we are doing something more complicated
    5036             :     // than nearest neighbouring, we have to fetch the nodata mask.
    5037             : 
    5038         743 :     GDALRasterBand *poMaskBand = nullptr;
    5039         743 :     bool bUseNoDataMask = false;
    5040         743 :     bool bCanUseCascaded = true;
    5041             : 
    5042         743 :     if (!STARTS_WITH_CI(pszResampling, "NEAR"))
    5043             :     {
    5044             :         // Special case if we are an alpha/mask band. We want it to be
    5045             :         // considered as the mask band to avoid alpha=0 to be taken into account
    5046             :         // in average computation.
    5047         392 :         if (poSrcBand->IsMaskBand())
    5048             :         {
    5049          51 :             poMaskBand = poSrcBand;
    5050          51 :             bUseNoDataMask = true;
    5051             :         }
    5052             :         else
    5053             :         {
    5054         341 :             poMaskBand = poSrcBand->GetMaskBand();
    5055         341 :             const int nMaskFlags = poSrcBand->GetMaskFlags();
    5056         341 :             bCanUseCascaded =
    5057         341 :                 (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
    5058         341 :             bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
    5059             :         }
    5060             :     }
    5061             : 
    5062         743 :     int nHasNoData = 0;
    5063         743 :     const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
    5064         743 :     const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    5065             :     const bool bPropagateNoData =
    5066         743 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    5067             : 
    5068         811 :     if (poSrcBand->GetBand() == 1 && bUseNoDataMask &&
    5069          68 :         CSLFetchNameValue(papszOptions, "CASCADING") == nullptr)
    5070             :     {
    5071         112 :         std::string osDetailMessage;
    5072          56 :         if (poSrcBand->HasConflictingMaskSources(&osDetailMessage, false))
    5073             :         {
    5074           2 :             CPLError(
    5075             :                 CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
    5076             :                 bHasNoData
    5077             :                     ? "Only the nodata value will be taken into account."
    5078             :                     : "Only the first listed one will be taken into account.");
    5079             :         }
    5080             :     }
    5081             : 
    5082             :     /* -------------------------------------------------------------------- */
    5083             :     /*      If we are operating on multiple overviews, and using            */
    5084             :     /*      averaging, let's do them in cascading order to reduce the       */
    5085             :     /*      amount of computation.                                          */
    5086             :     /* -------------------------------------------------------------------- */
    5087             : 
    5088             :     // In case the mask may be computed from another band of the dataset,
    5089             :     // we can't use cascaded generation, as the computation of the overviews
    5090             :     // of the band used for the mask band may not have yet occurred (#3033).
    5091         743 :     if ((STARTS_WITH_CI(pszResampling, "AVER") ||
    5092         520 :          EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
    5093         489 :          EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
    5094         435 :          EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
    5095         743 :          EQUAL(pszResampling, "MODE")) &&
    5096          44 :         nOverviewCount > 1 && bCanUseCascaded)
    5097          44 :         return GDALRegenerateCascadingOverviews(
    5098             :             poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
    5099          44 :             pProgressData, papszOptions);
    5100             : 
    5101             :     /* -------------------------------------------------------------------- */
    5102             :     /*      Setup one horizontal swath to read from the raw buffer.         */
    5103             :     /* -------------------------------------------------------------------- */
    5104         699 :     int nFRXBlockSize = 0;
    5105         699 :     int nFRYBlockSize = 0;
    5106         699 :     poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
    5107             : 
    5108         699 :     const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
    5109        1047 :     const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
    5110         997 :                                        EQUAL(pszResampling, "MODE") ||
    5111         298 :                                        !GDALDataTypeIsComplex(eSrcDataType);
    5112             :     const GDALDataType eWrkDataType =
    5113             :         bUseGenericResampleFn
    5114         699 :             ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
    5115         699 :             : GDT_CFloat32;
    5116             : 
    5117         699 :     const int nWidth = poSrcBand->GetXSize();
    5118         699 :     const int nHeight = poSrcBand->GetYSize();
    5119             : 
    5120         699 :     int nMaxOvrFactor = 1;
    5121        1521 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    5122             :     {
    5123         822 :         const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
    5124         822 :         const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
    5125         822 :         nMaxOvrFactor = std::max(
    5126             :             nMaxOvrFactor,
    5127         822 :             static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
    5128         822 :         nMaxOvrFactor = std::max(
    5129             :             nMaxOvrFactor,
    5130         822 :             static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
    5131             :     }
    5132             : 
    5133         699 :     int nFullResYChunk = nFRYBlockSize;
    5134         699 :     int nMaxChunkYSizeQueried = 0;
    5135             : 
    5136             :     const auto UpdateChunkHeightAndGetChunkSize =
    5137        9441 :         [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
    5138       76489 :          eWrkDataType, nWidth]()
    5139             :     {
    5140             :         // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
    5141             :         // + nFullResYChunk) / nMaxOvrFactor)
    5142        9441 :         if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
    5143             :         {
    5144           1 :             return GINTBIG_MAX;
    5145             :         }
    5146        9440 :         nFullResYChunk =
    5147        9440 :             std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
    5148        9440 :         if ((nKernelRadius > 0 &&
    5149         970 :              nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
    5150        9440 :             nFullResYChunk >
    5151        9440 :                 INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
    5152             :         {
    5153           0 :             return GINTBIG_MAX;
    5154             :         }
    5155        9440 :         nMaxChunkYSizeQueried =
    5156        9440 :             nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
    5157        9440 :         if (GDALGetDataTypeSizeBytes(eWrkDataType) >
    5158        9440 :             std::numeric_limits<int64_t>::max() /
    5159        9440 :                 (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
    5160             :         {
    5161           1 :             return GINTBIG_MAX;
    5162             :         }
    5163        9439 :         return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
    5164        9439 :                nMaxChunkYSizeQueried * nWidth;
    5165         699 :     };
    5166             : 
    5167             :     const char *pszChunkYSize =
    5168         699 :         CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
    5169             : #ifndef __COVERITY__
    5170             :     // Only configurable for debug / testing
    5171         699 :     if (pszChunkYSize)
    5172             :     {
    5173           0 :         nFullResYChunk = atoi(pszChunkYSize);
    5174             :     }
    5175             : #endif
    5176             : 
    5177             :     // Only configurable for debug / testing
    5178             :     const int nChunkMaxSize =
    5179         699 :         atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
    5180             : 
    5181         699 :     auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
    5182         699 :     if (nChunkSize > nChunkMaxSize)
    5183             :     {
    5184          15 :         if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
    5185          44 :             !GDALDataTypeIsComplex(eSrcDataType) &&
    5186          14 :             (!STARTS_WITH_CI(pszResampling, "AVER") ||
    5187           2 :              EQUAL(pszResampling, "AVERAGE")))
    5188             :         {
    5189             :             // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
    5190             :             // which uses a block-based strategy, which is much less memory
    5191             :             // hungry.
    5192          14 :             return GDALRegenerateOverviewsMultiBand(
    5193             :                 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
    5194          14 :                 pfnProgress, pProgressData, papszOptions);
    5195             :         }
    5196           1 :         else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
    5197             :         {
    5198           0 :             return GDALRegenerateCascadingOverviews(
    5199             :                 poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
    5200           0 :                 pfnProgress, pProgressData, papszOptions);
    5201             :         }
    5202             :     }
    5203         684 :     else if (pszChunkYSize == nullptr)
    5204             :     {
    5205             :         // Try to get as close as possible to nChunkMaxSize
    5206        9426 :         while (nChunkSize < nChunkMaxSize / 2)
    5207             :         {
    5208        8742 :             nFullResYChunk *= 2;
    5209        8742 :             nChunkSize = UpdateChunkHeightAndGetChunkSize();
    5210             :         }
    5211             :     }
    5212             : 
    5213             :     // Structure describing a resampling job
    5214             :     struct OvrJob
    5215             :     {
    5216             :         // Buffers to free when job is finished
    5217             :         std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
    5218             :         std::shared_ptr<PointerHolder> oSrcBufferHolder{};
    5219             :         std::unique_ptr<PointerHolder> oDstBufferHolder{};
    5220             : 
    5221             :         GDALRasterBand *poDstBand = nullptr;
    5222             : 
    5223             :         // Input parameters of pfnResampleFn
    5224             :         GDALResampleFunction pfnResampleFn = nullptr;
    5225             :         int nSrcWidth = 0;
    5226             :         int nSrcHeight = 0;
    5227             :         int nDstWidth = 0;
    5228             :         GDALOverviewResampleArgs args{};
    5229             :         const void *pChunk = nullptr;
    5230             :         bool bUseGenericResampleFn = false;
    5231             : 
    5232             :         // Output values of resampling function
    5233             :         CPLErr eErr = CE_Failure;
    5234             :         void *pDstBuffer = nullptr;
    5235             :         GDALDataType eDstBufferDataType = GDT_Unknown;
    5236             : 
    5237           0 :         void SetSrcMaskBufferHolder(
    5238             :             const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
    5239             :         {
    5240           0 :             oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
    5241           0 :         }
    5242             : 
    5243           0 :         void SetSrcBufferHolder(
    5244             :             const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
    5245             :         {
    5246           0 :             oSrcBufferHolder = oSrcBufferHolderIn;
    5247           0 :         }
    5248             : 
    5249         791 :         void NotifyFinished()
    5250             :         {
    5251        1582 :             std::lock_guard guard(mutex);
    5252         791 :             bFinished = true;
    5253         791 :             cv.notify_one();
    5254         791 :         }
    5255             : 
    5256           0 :         bool IsFinished()
    5257             :         {
    5258           0 :             std::lock_guard guard(mutex);
    5259           0 :             return bFinished;
    5260             :         }
    5261             : 
    5262           0 :         void WaitFinished()
    5263             :         {
    5264           0 :             std::unique_lock oGuard(mutex);
    5265           0 :             while (!bFinished)
    5266             :             {
    5267           0 :                 cv.wait(oGuard);
    5268             :             }
    5269           0 :         }
    5270             : 
    5271             :       private:
    5272             :         // Synchronization
    5273             :         bool bFinished = false;
    5274             :         std::mutex mutex{};
    5275             :         std::condition_variable cv{};
    5276             :     };
    5277             : 
    5278             :     // Thread function to resample
    5279         791 :     const auto JobResampleFunc = [](void *pData)
    5280             :     {
    5281         791 :         OvrJob *poJob = static_cast<OvrJob *>(pData);
    5282             : 
    5283         791 :         if (poJob->bUseGenericResampleFn)
    5284             :         {
    5285         789 :             poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
    5286             :                                                &(poJob->pDstBuffer),
    5287             :                                                &(poJob->eDstBufferDataType));
    5288             :         }
    5289             :         else
    5290             :         {
    5291           2 :             poJob->eErr = GDALResampleChunkC32R(
    5292             :                 poJob->nSrcWidth, poJob->nSrcHeight,
    5293           2 :                 static_cast<const float *>(poJob->pChunk),
    5294             :                 poJob->args.nChunkYOff, poJob->args.nChunkYSize,
    5295             :                 poJob->args.nDstYOff, poJob->args.nDstYOff2,
    5296             :                 poJob->args.nOvrXSize, poJob->args.nOvrYSize,
    5297             :                 &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
    5298             :                 poJob->args.pszResampling);
    5299             :         }
    5300             : 
    5301         791 :         auto pDstBuffer = poJob->pDstBuffer;
    5302         791 :         poJob->oDstBufferHolder = std::make_unique<PointerHolder>(pDstBuffer);
    5303             : 
    5304         791 :         poJob->NotifyFinished();
    5305         791 :     };
    5306             : 
    5307             :     // Function to write resampled data to target band
    5308         791 :     const auto WriteJobData = [](const OvrJob *poJob)
    5309             :     {
    5310        1582 :         return poJob->poDstBand->RasterIO(
    5311         791 :             GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
    5312         791 :             poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
    5313         791 :             poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
    5314         791 :             poJob->eDstBufferDataType, 0, 0, nullptr);
    5315             :     };
    5316             : 
    5317             :     // Wait for completion of oldest job and serialize it
    5318             :     const auto WaitAndFinalizeOldestJob =
    5319           0 :         [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    5320             :     {
    5321           0 :         auto poOldestJob = jobList.front().get();
    5322           0 :         poOldestJob->WaitFinished();
    5323           0 :         CPLErr l_eErr = poOldestJob->eErr;
    5324           0 :         if (l_eErr == CE_None)
    5325             :         {
    5326           0 :             l_eErr = WriteJobData(poOldestJob);
    5327             :         }
    5328             : 
    5329           0 :         jobList.pop_front();
    5330           0 :         return l_eErr;
    5331             :     };
    5332             : 
    5333             :     // Queue of jobs
    5334        1370 :     std::list<std::unique_ptr<OvrJob>> jobList;
    5335             : 
    5336         685 :     GByte *pabyChunkNodataMask = nullptr;
    5337         685 :     void *pChunk = nullptr;
    5338             : 
    5339         685 :     const int nThreads = GDALGetNumThreads(GDAL_DEFAULT_MAX_THREAD_COUNT,
    5340             :                                            /* bDefaultToAllCPUs=*/false);
    5341             :     auto poThreadPool =
    5342         685 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    5343             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    5344        1370 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    5345             : 
    5346             :     /* -------------------------------------------------------------------- */
    5347             :     /*      Loop over image operating on chunks.                            */
    5348             :     /* -------------------------------------------------------------------- */
    5349         685 :     int nChunkYOff = 0;
    5350         685 :     CPLErr eErr = CE_None;
    5351             : 
    5352        1375 :     for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
    5353         690 :          nChunkYOff += nFullResYChunk)
    5354             :     {
    5355         690 :         if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
    5356             :                          pProgressData))
    5357             :         {
    5358           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    5359           0 :             eErr = CE_Failure;
    5360             :         }
    5361             : 
    5362         690 :         if (nFullResYChunk + nChunkYOff > nHeight)
    5363         682 :             nFullResYChunk = nHeight - nChunkYOff;
    5364             : 
    5365         690 :         int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
    5366         690 :         int nChunkYSizeQueried =
    5367         690 :             nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
    5368         690 :         if (nChunkYOffQueried < 0)
    5369             :         {
    5370          83 :             nChunkYSizeQueried += nChunkYOffQueried;
    5371          83 :             nChunkYOffQueried = 0;
    5372             :         }
    5373         690 :         if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
    5374          83 :             nChunkYSizeQueried = nHeight - nChunkYOffQueried;
    5375             : 
    5376             :         // Avoid accumulating too many tasks and exhausting RAM
    5377             :         // Try to complete already finished jobs
    5378         690 :         while (eErr == CE_None && !jobList.empty())
    5379             :         {
    5380           0 :             auto poOldestJob = jobList.front().get();
    5381           0 :             if (!poOldestJob->IsFinished())
    5382           0 :                 break;
    5383           0 :             eErr = poOldestJob->eErr;
    5384           0 :             if (eErr == CE_None)
    5385             :             {
    5386           0 :                 eErr = WriteJobData(poOldestJob);
    5387             :             }
    5388             : 
    5389           0 :             jobList.pop_front();
    5390             :         }
    5391             : 
    5392             :         // And in case we have saturated the number of threads,
    5393             :         // wait for completion of tasks to go below the threshold.
    5394        1380 :         while (eErr == CE_None &&
    5395         690 :                jobList.size() >= static_cast<size_t>(nThreads))
    5396             :         {
    5397           0 :             eErr = WaitAndFinalizeOldestJob(jobList);
    5398             :         }
    5399             : 
    5400             :         // (Re)allocate buffers if needed
    5401         690 :         if (pChunk == nullptr)
    5402             :         {
    5403         685 :             pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    5404             :                                          nMaxChunkYSizeQueried, nWidth);
    5405             :         }
    5406         690 :         if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
    5407             :         {
    5408         139 :             pabyChunkNodataMask = static_cast<GByte *>(
    5409         139 :                 VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
    5410             :         }
    5411             : 
    5412         690 :         if (pChunk == nullptr ||
    5413         139 :             (bUseNoDataMask && pabyChunkNodataMask == nullptr))
    5414             :         {
    5415           0 :             CPLFree(pChunk);
    5416           0 :             CPLFree(pabyChunkNodataMask);
    5417           0 :             return CE_Failure;
    5418             :         }
    5419             : 
    5420             :         // Read chunk.
    5421         690 :         if (eErr == CE_None)
    5422         690 :             eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    5423             :                                        nChunkYSizeQueried, pChunk, nWidth,
    5424             :                                        nChunkYSizeQueried, eWrkDataType, 0, 0,
    5425             :                                        nullptr);
    5426         690 :         if (eErr == CE_None && bUseNoDataMask)
    5427         139 :             eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    5428             :                                         nChunkYSizeQueried, pabyChunkNodataMask,
    5429             :                                         nWidth, nChunkYSizeQueried, GDT_UInt8,
    5430             :                                         0, 0, nullptr);
    5431             : 
    5432             :         // Special case to promote 1bit data to 8bit 0/255 values.
    5433         690 :         if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
    5434             :         {
    5435           9 :             if (eWrkDataType == GDT_Float32)
    5436             :             {
    5437           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    5438           0 :                 for (size_t i = 0;
    5439           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5440             :                 {
    5441           0 :                     if (pafChunk[i] == 1.0f)
    5442           0 :                         pafChunk[i] = 255.0f;
    5443             :                 }
    5444             :             }
    5445           9 :             else if (eWrkDataType == GDT_UInt8)
    5446             :             {
    5447           9 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    5448      168417 :                 for (size_t i = 0;
    5449      168417 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5450             :                 {
    5451      168408 :                     if (pabyChunk[i] == 1)
    5452      127437 :                         pabyChunk[i] = 255;
    5453             :                 }
    5454             :             }
    5455           0 :             else if (eWrkDataType == GDT_UInt16)
    5456             :             {
    5457           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    5458           0 :                 for (size_t i = 0;
    5459           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5460             :                 {
    5461           0 :                     if (pasChunk[i] == 1)
    5462           0 :                         pasChunk[i] = 255;
    5463             :                 }
    5464             :             }
    5465           0 :             else if (eWrkDataType == GDT_Float64)
    5466             :             {
    5467           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    5468           0 :                 for (size_t i = 0;
    5469           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5470             :                 {
    5471           0 :                     if (padfChunk[i] == 1.0)
    5472           0 :                         padfChunk[i] = 255.0;
    5473             :                 }
    5474             :             }
    5475             :             else
    5476             :             {
    5477           0 :                 CPLAssert(false);
    5478             :             }
    5479             :         }
    5480         681 :         else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
    5481             :         {
    5482           0 :             if (eWrkDataType == GDT_Float32)
    5483             :             {
    5484           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    5485           0 :                 for (size_t i = 0;
    5486           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5487             :                 {
    5488           0 :                     if (pafChunk[i] == 1.0f)
    5489           0 :                         pafChunk[i] = 0.0f;
    5490           0 :                     else if (pafChunk[i] == 0.0f)
    5491           0 :                         pafChunk[i] = 255.0f;
    5492             :                 }
    5493             :             }
    5494           0 :             else if (eWrkDataType == GDT_UInt8)
    5495             :             {
    5496           0 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    5497           0 :                 for (size_t i = 0;
    5498           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5499             :                 {
    5500           0 :                     if (pabyChunk[i] == 1)
    5501           0 :                         pabyChunk[i] = 0;
    5502           0 :                     else if (pabyChunk[i] == 0)
    5503           0 :                         pabyChunk[i] = 255;
    5504             :                 }
    5505             :             }
    5506           0 :             else if (eWrkDataType == GDT_UInt16)
    5507             :             {
    5508           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    5509           0 :                 for (size_t i = 0;
    5510           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5511             :                 {
    5512           0 :                     if (pasChunk[i] == 1)
    5513           0 :                         pasChunk[i] = 0;
    5514           0 :                     else if (pasChunk[i] == 0)
    5515           0 :                         pasChunk[i] = 255;
    5516             :                 }
    5517             :             }
    5518           0 :             else if (eWrkDataType == GDT_Float64)
    5519             :             {
    5520           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    5521           0 :                 for (size_t i = 0;
    5522           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5523             :                 {
    5524           0 :                     if (padfChunk[i] == 1.0)
    5525           0 :                         padfChunk[i] = 0.0;
    5526           0 :                     else if (padfChunk[i] == 0.0)
    5527           0 :                         padfChunk[i] = 255.0;
    5528             :                 }
    5529             :             }
    5530             :             else
    5531             :             {
    5532           0 :                 CPLAssert(false);
    5533             :             }
    5534             :         }
    5535             : 
    5536         690 :         auto pChunkRaw = pChunk;
    5537         690 :         auto pabyChunkNodataMaskRaw = pabyChunkNodataMask;
    5538         690 :         std::shared_ptr<PointerHolder> oSrcBufferHolder;
    5539         690 :         std::shared_ptr<PointerHolder> oSrcMaskBufferHolder;
    5540         690 :         if (poJobQueue)
    5541             :         {
    5542           0 :             oSrcBufferHolder = std::make_shared<PointerHolder>(pChunk);
    5543             :             oSrcMaskBufferHolder =
    5544           0 :                 std::make_shared<PointerHolder>(pabyChunkNodataMask);
    5545             :         }
    5546             : 
    5547        1481 :         for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
    5548             :              ++iOverview)
    5549             :         {
    5550         791 :             GDALRasterBand *poDstBand = papoOvrBands[iOverview];
    5551         791 :             const int nDstWidth = poDstBand->GetXSize();
    5552         791 :             const int nDstHeight = poDstBand->GetYSize();
    5553             : 
    5554         791 :             const double dfXRatioDstToSrc =
    5555         791 :                 static_cast<double>(nWidth) / nDstWidth;
    5556         791 :             const double dfYRatioDstToSrc =
    5557         791 :                 static_cast<double>(nHeight) / nDstHeight;
    5558             : 
    5559             :             /* --------------------------------------------------------------------
    5560             :              */
    5561             :             /*      Figure out the line to start writing to, and the first line
    5562             :              */
    5563             :             /*      to not write to.  In theory this approach should ensure that
    5564             :              */
    5565             :             /*      every output line will be written if all input chunks are */
    5566             :             /*      processed. */
    5567             :             /* --------------------------------------------------------------------
    5568             :              */
    5569         791 :             int nDstYOff =
    5570         791 :                 static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
    5571         791 :             if (nDstYOff == nDstHeight)
    5572           0 :                 continue;
    5573         791 :             int nDstYOff2 = static_cast<int>(
    5574         791 :                 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
    5575             : 
    5576         791 :             if (nChunkYOff + nFullResYChunk == nHeight)
    5577         784 :                 nDstYOff2 = nDstHeight;
    5578             : #if DEBUG_VERBOSE
    5579             :             CPLDebug("GDAL",
    5580             :                      "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
    5581             :                      nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
    5582             :                      nDstWidth, nDstYOff2 - nDstYOff);
    5583             : #endif
    5584             : 
    5585        1582 :             auto poJob = std::make_unique<OvrJob>();
    5586         791 :             poJob->pfnResampleFn = pfnResampleFn;
    5587         791 :             poJob->bUseGenericResampleFn = bUseGenericResampleFn;
    5588         791 :             poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
    5589         791 :             poJob->args.nOvrXSize = poDstBand->GetXSize();
    5590         791 :             poJob->args.nOvrYSize = poDstBand->GetYSize();
    5591        1582 :             const char *pszNBITS = poDstBand->GetMetadataItem(
    5592         791 :                 GDALMD_NBITS, GDAL_MDD_IMAGE_STRUCTURE);
    5593         791 :             poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    5594         791 :             poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    5595         791 :             poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    5596         791 :             poJob->args.eWrkDataType = eWrkDataType;
    5597         791 :             poJob->pChunk = pChunkRaw;
    5598         791 :             poJob->args.pabyChunkNodataMask = pabyChunkNodataMaskRaw;
    5599         791 :             poJob->nSrcWidth = nWidth;
    5600         791 :             poJob->nSrcHeight = nHeight;
    5601         791 :             poJob->args.nChunkXOff = 0;
    5602         791 :             poJob->args.nChunkXSize = nWidth;
    5603         791 :             poJob->args.nChunkYOff = nChunkYOffQueried;
    5604         791 :             poJob->args.nChunkYSize = nChunkYSizeQueried;
    5605         791 :             poJob->nDstWidth = nDstWidth;
    5606         791 :             poJob->args.nDstXOff = 0;
    5607         791 :             poJob->args.nDstXOff2 = nDstWidth;
    5608         791 :             poJob->args.nDstYOff = nDstYOff;
    5609         791 :             poJob->args.nDstYOff2 = nDstYOff2;
    5610         791 :             poJob->poDstBand = poDstBand;
    5611         791 :             poJob->args.pszResampling = pszResampling;
    5612         791 :             poJob->args.bHasNoData = bHasNoData;
    5613         791 :             poJob->args.dfNoDataValue = dfNoDataValue;
    5614         791 :             poJob->args.poColorTable = poColorTable;
    5615         791 :             poJob->args.eSrcDataType = eSrcDataType;
    5616         791 :             poJob->args.bPropagateNoData = bPropagateNoData;
    5617             : 
    5618         791 :             if (poJobQueue)
    5619             :             {
    5620           0 :                 poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
    5621           0 :                 poJob->SetSrcBufferHolder(oSrcBufferHolder);
    5622           0 :                 poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    5623           0 :                 jobList.emplace_back(std::move(poJob));
    5624             :             }
    5625             :             else
    5626             :             {
    5627         791 :                 JobResampleFunc(poJob.get());
    5628         791 :                 eErr = poJob->eErr;
    5629         791 :                 if (eErr == CE_None)
    5630             :                 {
    5631         791 :                     eErr = WriteJobData(poJob.get());
    5632             :                 }
    5633             :             }
    5634             :         }
    5635             :     }
    5636             : 
    5637         685 :     VSIFree(pChunk);
    5638         685 :     VSIFree(pabyChunkNodataMask);
    5639             : 
    5640             :     // Wait for all pending jobs to complete
    5641         685 :     while (!jobList.empty())
    5642             :     {
    5643           0 :         const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    5644           0 :         if (l_eErr != CE_None && eErr == CE_None)
    5645           0 :             eErr = l_eErr;
    5646             :     }
    5647             : 
    5648             :     /* -------------------------------------------------------------------- */
    5649             :     /*      Renormalized overview mean / stddev if needed.                  */
    5650             :     /* -------------------------------------------------------------------- */
    5651         685 :     if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
    5652             :     {
    5653           0 :         GDALOverviewMagnitudeCorrection(
    5654             :             poSrcBand, nOverviewCount,
    5655             :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
    5656             :             GDALDummyProgress, nullptr);
    5657             :     }
    5658             : 
    5659             :     /* -------------------------------------------------------------------- */
    5660             :     /*      It can be important to flush out data to overviews.             */
    5661             :     /* -------------------------------------------------------------------- */
    5662        1469 :     for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
    5663             :          ++iOverview)
    5664             :     {
    5665         784 :         eErr = papoOvrBands[iOverview]->FlushCache(false);
    5666             :     }
    5667             : 
    5668         685 :     if (eErr == CE_None)
    5669         685 :         pfnProgress(1.0, nullptr, pProgressData);
    5670             : 
    5671         685 :     return eErr;
    5672             : }
    5673             : 
    5674             : /************************************************************************/
    5675             : /*                  GDALRegenerateOverviewsMultiBand()                  */
    5676             : /************************************************************************/
    5677             : 
    5678             : /**
    5679             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    5680             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
    5681             :  *
    5682             :  * This function will generate one or more overview images from a base
    5683             :  * image using the requested downsampling algorithm.  Its primary use
    5684             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    5685             :  * can also be used to generate downsampled images in one file from another
    5686             :  * outside the overview architecture.
    5687             :  *
    5688             :  * The output bands need to exist in advance and share the same characteristics
    5689             :  * (type, dimensions)
    5690             :  *
    5691             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    5692             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE".
    5693             :  *
    5694             :  * It does not support color tables or complex data types.
    5695             :  *
    5696             :  * The pseudo-algorithm used by the function is :
    5697             :  *    for each overview
    5698             :  *       iterate on lines of the source by a step of deltay
    5699             :  *           iterate on columns of the source  by a step of deltax
    5700             :  *               read the source data of size deltax * deltay for all the bands
    5701             :  *               generate the corresponding overview block for all the bands
    5702             :  *
    5703             :  * This function will properly honour NODATA_VALUES tuples (special dataset
    5704             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    5705             :  * considered as the nodata value and not each value of the triplet
    5706             :  * independently per band.
    5707             :  *
    5708             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    5709             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    5710             :  * overview computation.
    5711             :  *
    5712             :  * @param nBands the number of bands, size of papoSrcBands and size of
    5713             :  *               first dimension of papapoOverviewBands
    5714             :  * @param papoSrcBands the list of source bands to downsample
    5715             :  * @param nOverviews the number of downsampled overview levels being generated.
    5716             :  * @param papapoOverviewBands bidimensional array of bands. First dimension is
    5717             :  *                            indexed by nBands. Second dimension is indexed by
    5718             :  *                            nOverviews.
    5719             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    5720             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
    5721             :  * @param pfnProgress progress report function.
    5722             :  * @param pProgressData progress function callback data.
    5723             :  * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
    5724             :  *                     key=value pairs, or NULL
    5725             :  *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
    5726             :  *                     options can be specified to express that overviews should
    5727             :  *                     be regenerated only in the specified subset of the source
    5728             :  *                     dataset.
    5729             :  * @return CE_None on success or CE_Failure on failure.
    5730             :  */
    5731             : 
    5732         390 : CPLErr GDALRegenerateOverviewsMultiBand(
    5733             :     int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
    5734             :     GDALRasterBand *const *const *papapoOverviewBands,
    5735             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    5736             :     void *pProgressData, CSLConstList papszOptions)
    5737             : {
    5738         390 :     CPL_IGNORE_RET_VAL(papszOptions);
    5739             : 
    5740         390 :     if (pfnProgress == nullptr)
    5741          11 :         pfnProgress = GDALDummyProgress;
    5742             : 
    5743         390 :     if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
    5744           3 :         return CE_None;
    5745             : 
    5746             :     // Sanity checks.
    5747         387 :     if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
    5748         193 :         !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
    5749          84 :         !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
    5750          25 :         !EQUAL(pszResampling, "CUBICSPLINE") &&
    5751          24 :         !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
    5752           5 :         !EQUAL(pszResampling, "MODE"))
    5753             :     {
    5754           0 :         CPLError(CE_Failure, CPLE_NotSupported,
    5755             :                  "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
    5756             :                  "not supported",
    5757             :                  pszResampling);
    5758           0 :         return CE_Failure;
    5759             :     }
    5760             : 
    5761         387 :     int nKernelRadius = 0;
    5762             :     GDALResampleFunction pfnResampleFn =
    5763         387 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    5764         387 :     if (pfnResampleFn == nullptr)
    5765           0 :         return CE_Failure;
    5766             : 
    5767         387 :     const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
    5768         387 :     const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
    5769         387 :     if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
    5770           0 :         return CE_None;
    5771         387 :     GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
    5772       66235 :     for (int iBand = 1; iBand < nBands; ++iBand)
    5773             :     {
    5774      131696 :         if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
    5775       65848 :             papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
    5776             :         {
    5777           0 :             CPLError(
    5778             :                 CE_Failure, CPLE_NotSupported,
    5779             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5780             :                 "have the same dimensions");
    5781           0 :             return CE_Failure;
    5782             :         }
    5783       65848 :         if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
    5784             :         {
    5785           0 :             CPLError(
    5786             :                 CE_Failure, CPLE_NotSupported,
    5787             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5788             :                 "have the same data type");
    5789           0 :             return CE_Failure;
    5790             :         }
    5791             :     }
    5792             : 
    5793        1030 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5794             :     {
    5795         643 :         const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
    5796         643 :         const int nDstWidth = poOvrFirstBand->GetXSize();
    5797         643 :         const int nDstHeight = poOvrFirstBand->GetYSize();
    5798       66751 :         for (int iBand = 1; iBand < nBands; ++iBand)
    5799             :         {
    5800       66108 :             const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
    5801      132216 :             if (poOvrBand->GetXSize() != nDstWidth ||
    5802       66108 :                 poOvrBand->GetYSize() != nDstHeight)
    5803             :             {
    5804           0 :                 CPLError(
    5805             :                     CE_Failure, CPLE_NotSupported,
    5806             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5807             :                     "of the same level must have the same dimensions");
    5808           0 :                 return CE_Failure;
    5809             :             }
    5810       66108 :             if (poOvrBand->GetRasterDataType() != eDataType)
    5811             :             {
    5812           0 :                 CPLError(
    5813             :                     CE_Failure, CPLE_NotSupported,
    5814             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5815             :                     "must have the same data type as the source bands");
    5816           0 :                 return CE_Failure;
    5817             :             }
    5818             :         }
    5819             :     }
    5820             : 
    5821             :     // First pass to compute the total number of pixels to write.
    5822         387 :     double dfTotalPixelCount = 0;
    5823         387 :     const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
    5824         387 :     const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
    5825         387 :     const int nSrcXSize = atoi(CSLFetchNameValueDef(
    5826             :         papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
    5827         387 :     const int nSrcYSize = atoi(CSLFetchNameValueDef(
    5828             :         papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
    5829        1030 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5830             :     {
    5831         643 :         dfTotalPixelCount +=
    5832        1286 :             static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
    5833         643 :             papapoOverviewBands[0][iOverview]->GetXSize() *
    5834        1286 :             static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
    5835         643 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5836             :     }
    5837             : 
    5838             :     const GDALDataType eWrkDataType =
    5839         387 :         GDALGetOvrWorkDataType(pszResampling, eDataType);
    5840             :     const int nWrkDataTypeSize =
    5841         387 :         std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
    5842             : 
    5843         387 :     const bool bIsMask = papoSrcBands[0]->IsMaskBand();
    5844             : 
    5845             :     // If we have a nodata mask and we are doing something more complicated
    5846             :     // than nearest neighbouring, we have to fetch to nodata mask.
    5847             :     const bool bUseNoDataMask =
    5848         574 :         !STARTS_WITH_CI(pszResampling, "NEAR") &&
    5849         187 :         (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
    5850             : 
    5851         774 :     std::vector<bool> abHasNoData(nBands);
    5852         774 :     std::vector<double> adfNoDataValue(nBands);
    5853             : 
    5854       66622 :     for (int iBand = 0; iBand < nBands; ++iBand)
    5855             :     {
    5856       66235 :         int nHasNoData = 0;
    5857      132470 :         adfNoDataValue[iBand] =
    5858       66235 :             papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
    5859       66235 :         abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
    5860             :     }
    5861             : 
    5862         774 :     std::string osDetailMessage;
    5863         440 :     if (bUseNoDataMask &&
    5864          53 :         papoSrcBands[0]->HasConflictingMaskSources(&osDetailMessage, false))
    5865             :     {
    5866           9 :         CPLError(CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
    5867          18 :                  abHasNoData[0]
    5868             :                      ? "Only the nodata value will be taken into account."
    5869           9 :                      : "Only the first listed one will be taken into account.");
    5870             :     }
    5871             : 
    5872             :     const bool bPropagateNoData =
    5873         387 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    5874             : 
    5875         387 :     const int nThreads = GDALGetNumThreads(GDAL_DEFAULT_MAX_THREAD_COUNT,
    5876             :                                            /* bDefaultToAllCPUs=*/false);
    5877             :     auto poThreadPool =
    5878         387 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    5879             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    5880         774 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    5881             : 
    5882             :     // Only configurable for debug / testing
    5883         387 :     const GIntBig nChunkMaxSize = []() -> GIntBig
    5884             :     {
    5885             :         const char *pszVal =
    5886         387 :             CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
    5887         387 :         if (pszVal)
    5888             :         {
    5889          15 :             GIntBig nRet = 0;
    5890          15 :             CPLParseMemorySize(pszVal, &nRet, nullptr);
    5891          15 :             return std::max<GIntBig>(100, nRet);
    5892             :         }
    5893         372 :         return 10 * 1024 * 1024;
    5894         387 :     }();
    5895             : 
    5896             :     // Only configurable for debug / testing
    5897         387 :     const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
    5898             :     {
    5899         387 :         const char *pszVal = CPLGetConfigOption(
    5900             :             "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
    5901         387 :         if (pszVal)
    5902             :         {
    5903          14 :             GIntBig nRet = 0;
    5904          14 :             CPLParseMemorySize(pszVal, &nRet, nullptr);
    5905          14 :             return std::max<GIntBig>(100, nRet);
    5906             :         }
    5907         373 :         const auto nUsableRAM = CPLGetUsablePhysicalRAM();
    5908         373 :         if (nUsableRAM > 0)
    5909         373 :             return nUsableRAM / 10;
    5910             :         // Select a value to be able to at least downsample by 2 for a RGB
    5911             :         // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
    5912           0 :         return 100 * 1024 * 1024;
    5913         387 :     }();
    5914             : 
    5915             :     // Second pass to do the real job.
    5916         387 :     double dfCurPixelCount = 0;
    5917         387 :     CPLErr eErr = CE_None;
    5918        1024 :     for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
    5919             :          ++iOverview)
    5920             :     {
    5921         642 :         int iSrcOverview = -1;  // -1 means the source bands.
    5922             : 
    5923             :         const int nDstTotalWidth =
    5924         642 :             papapoOverviewBands[0][iOverview]->GetXSize();
    5925             :         const int nDstTotalHeight =
    5926         642 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5927             : 
    5928             :         // Compute the coordinates of the target region to refresh
    5929         642 :         constexpr double EPS = 1e-8;
    5930         642 :         const int nDstXOffStart = static_cast<int>(
    5931         642 :             static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
    5932             :             EPS);
    5933             :         const int nDstXOffEnd =
    5934        1284 :             std::min(static_cast<int>(
    5935         642 :                          std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
    5936         642 :                                        nToplevelSrcWidth * nDstTotalWidth -
    5937             :                                    EPS)),
    5938         642 :                      nDstTotalWidth);
    5939         642 :         const int nDstWidth = nDstXOffEnd - nDstXOffStart;
    5940         642 :         const int nDstYOffStart =
    5941         642 :             static_cast<int>(static_cast<double>(nSrcYOff) /
    5942         642 :                                  nToplevelSrcHeight * nDstTotalHeight +
    5943             :                              EPS);
    5944             :         const int nDstYOffEnd =
    5945        1284 :             std::min(static_cast<int>(
    5946         642 :                          std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
    5947         642 :                                        nToplevelSrcHeight * nDstTotalHeight -
    5948             :                                    EPS)),
    5949         642 :                      nDstTotalHeight);
    5950         642 :         const int nDstHeight = nDstYOffEnd - nDstYOffStart;
    5951             : 
    5952             :         // Try to use previous level of overview as the source to compute
    5953             :         // the next level.
    5954         642 :         int nSrcWidth = nToplevelSrcWidth;
    5955         642 :         int nSrcHeight = nToplevelSrcHeight;
    5956         897 :         if (iOverview > 0 &&
    5957         255 :             papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
    5958             :         {
    5959         247 :             nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
    5960         247 :             nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
    5961         247 :             iSrcOverview = iOverview - 1;
    5962             :         }
    5963             : 
    5964         642 :         const double dfXRatioDstToSrc =
    5965         642 :             static_cast<double>(nSrcWidth) / nDstTotalWidth;
    5966         642 :         const double dfYRatioDstToSrc =
    5967         642 :             static_cast<double>(nSrcHeight) / nDstTotalHeight;
    5968             : 
    5969             :         const int nOvrFactor =
    5970        1926 :             std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    5971         642 :                                  static_cast<int>(0.5 + dfYRatioDstToSrc)));
    5972             : 
    5973         642 :         int nDstChunkXSize = 0;
    5974         642 :         int nDstChunkYSize = 0;
    5975         642 :         papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
    5976             :                                                         &nDstChunkYSize);
    5977             : 
    5978         642 :         constexpr int PIXEL_MARGIN = 2;
    5979             :         // Try to extend the chunk size so that the memory needed to acquire
    5980             :         // source pixels goes up to 10 MB.
    5981             :         // This can help for drivers that support multi-threaded reading
    5982         642 :         const int nFullResYChunk = static_cast<int>(std::min<double>(
    5983         642 :             nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
    5984         642 :         const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
    5985        1284 :             nSrcHeight,
    5986        1284 :             nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    5987         642 :                                  nKernelRadius * nOvrFactor));
    5988         876 :         while (nDstChunkXSize < nDstWidth)
    5989             :         {
    5990         254 :             constexpr int INCREASE_FACTOR = 2;
    5991             : 
    5992         254 :             const int nFullResXChunk = static_cast<int>(std::min<double>(
    5993         508 :                 nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
    5994         254 :                                               dfXRatioDstToSrc));
    5995             : 
    5996             :             const int nFullResXChunkQueried =
    5997         254 :                 static_cast<int>(std::min<int64_t>(
    5998         508 :                     nSrcWidth,
    5999         508 :                     nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    6000         254 :                                          nKernelRadius * nOvrFactor));
    6001             : 
    6002         254 :             if (nBands > nChunkMaxSize / nFullResXChunkQueried /
    6003         254 :                              nFullResYChunkQueried / nWrkDataTypeSize)
    6004             :             {
    6005          20 :                 break;
    6006             :             }
    6007             : 
    6008         234 :             nDstChunkXSize *= INCREASE_FACTOR;
    6009             :         }
    6010         642 :         nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
    6011             : 
    6012         642 :         const int nFullResXChunk = static_cast<int>(std::min<double>(
    6013         642 :             nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
    6014         642 :         const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
    6015        1284 :             nSrcWidth,
    6016        1284 :             nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    6017         642 :                                  nKernelRadius * nOvrFactor));
    6018             : 
    6019             :         // Make sure that the RAM requirements to acquire the source data does
    6020             :         // not exceed nChunkMaxSizeForTempFile
    6021             :         // If so, reduce the destination chunk size, generate overviews in a
    6022             :         // temporary dataset, and copy that temporary dataset over the target
    6023             :         // overview bands (to avoid issues with lossy compression)
    6024             :         const bool bOverflowFullResXChunkYChunkQueried =
    6025         642 :             nBands > std::numeric_limits<int64_t>::max() /
    6026         642 :                          nFullResXChunkQueried / nFullResYChunkQueried /
    6027         642 :                          nWrkDataTypeSize;
    6028             : 
    6029         642 :         const auto nMemRequirement =
    6030             :             bOverflowFullResXChunkYChunkQueried
    6031         642 :                 ? 0
    6032         638 :                 : static_cast<GIntBig>(nFullResXChunkQueried) *
    6033         638 :                       nFullResYChunkQueried * nBands * nWrkDataTypeSize;
    6034             :         // Use a temporary dataset with a smaller destination chunk size
    6035         642 :         const auto nOverShootFactor =
    6036             :             nMemRequirement / nChunkMaxSizeForTempFile;
    6037             : 
    6038         642 :         constexpr int MIN_OVERSHOOT_FACTOR = 4;
    6039             :         const auto nSqrtOverShootFactor = std::max<GIntBig>(
    6040        1284 :             MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
    6041         642 :                                       static_cast<double>(nOverShootFactor)))));
    6042         642 :         constexpr int DEFAULT_CHUNK_SIZE = 256;
    6043         642 :         constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
    6044             :         const int nReducedDstChunkXSize =
    6045             :             bOverflowFullResXChunkYChunkQueried
    6046        1280 :                 ? DEFAULT_CHUNK_SIZE
    6047        1280 :                 : std::max(1, static_cast<int>(nDstChunkXSize /
    6048        1280 :                                                nSqrtOverShootFactor) &
    6049         638 :                                   ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
    6050             :         const int nReducedDstChunkYSize =
    6051             :             bOverflowFullResXChunkYChunkQueried
    6052        1280 :                 ? DEFAULT_CHUNK_SIZE
    6053        1280 :                 : std::max(1, static_cast<int>(nDstChunkYSize /
    6054        1280 :                                                nSqrtOverShootFactor) &
    6055         638 :                                   ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
    6056             : 
    6057         642 :         if (bOverflowFullResXChunkYChunkQueried ||
    6058             :             nMemRequirement > nChunkMaxSizeForTempFile)
    6059             :         {
    6060             :             const auto nDTSize =
    6061          43 :                 std::max(1, GDALGetDataTypeSizeBytes(eDataType));
    6062             :             const bool bTmpDSMemRequirementOverflow =
    6063          43 :                 nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
    6064          43 :                              nDstHeight / nDTSize;
    6065          43 :             const auto nTmpDSMemRequirement =
    6066             :                 bTmpDSMemRequirementOverflow
    6067          43 :                     ? 0
    6068          41 :                     : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
    6069          41 :                           nDTSize;
    6070             : 
    6071             :             // make sure that one band buffer doesn't overflow size_t
    6072             :             const bool bChunkSizeOverflow =
    6073          43 :                 static_cast<size_t>(nDTSize) >
    6074          43 :                 std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
    6075          43 :             const size_t nChunkSize =
    6076             :                 bChunkSizeOverflow
    6077          43 :                     ? 0
    6078          41 :                     : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
    6079             :             // Factory for the resampling VRT: nDstTotalWidth x nDstTotalHeight, one eWrkDataType band per input band, using the given VRT block size. Returns the owning unique_ptr.
    6080             :             const auto CreateVRT =
    6081          41 :                 [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
    6082             :                  pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
    6083             :                  iSrcOverview, &abHasNoData,
    6084      393585 :                  &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
    6085             :             {
    6086             :                 auto poVRTDS = std::make_unique<VRTDataset>(
    6087          41 :                     nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
    6088          41 :                     nVRTBlockYSize);
    6089             :                 // Each VRT band gets a single simple source that uses pszResampling
    6090       65620 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    6091             :                 {
    6092      131158 :                     auto poVRTSrc = std::make_unique<VRTSimpleSource>();
    6093       65579 :                     poVRTSrc->SetResampling(pszResampling);
    6094       65579 :                     poVRTDS->AddBand(eWrkDataType);
    6095             :                     auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
    6096       65579 :                         poVRTDS->GetRasterBand(iBand + 1));
    6097             :                     // Source is the input band, or its iSrcOverview-th overview when iSrcOverview != -1
    6098       65579 :                     auto poSrcBand = papoSrcBands[iBand];
    6099       65579 :                     if (iSrcOverview != -1)
    6100          24 :                         poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
    6101       65579 :                     poVRTBand->ConfigureSource(  // maps window (0, 0, nSrcWidth, nSrcHeight) onto the whole VRT extent
    6102             :                         poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
    6103             :                         nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
    6104             :                     // Add the source to the band (release(): presumably the band takes ownership - TODO confirm)
    6105       65579 :                     poVRTBand->AddSource(poVRTSrc.release());
    6106       65579 :                     if (abHasNoData[iBand])
    6107           3 :                         poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
    6108             :                 }
    6109             :                 // Per-dataset mask: add a VRT mask band fed by the source mask. NOTE(review): the flags are tested on papoSrcBands[0] even when iSrcOverview != -1.
    6110          42 :                 if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
    6111           1 :                     poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
    6112             :                 {
    6113             :                     VRTSourcedRasterBand *poMaskVRTBand =
    6114           1 :                         cpl::down_cast<VRTSourcedRasterBand *>(
    6115           1 :                             poVRTDS->GetRasterBand(1)->GetMaskBand());
    6116           1 :                     auto poSrcBand = papoSrcBands[0];
    6117           1 :                     if (iSrcOverview != -1)
    6118           0 :                         poSrcBand = papapoOverviewBands[0][iSrcOverview];
    6119           1 :                     poMaskVRTBand->AddMaskBandSource(
    6120           1 :                         poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
    6121             :                         0, 0, nDstTotalWidth, nDstTotalHeight);
    6122             :                 }
    6123             : 
    6124          41 :                 return poVRTDS;
    6125          43 :             };
    6126             : 
    6127             :             // If the overview accommodates chunking, do so and recurse
    6128             :             // to avoid generating full size temporary files
    6129          43 :             if (!bOverflowFullResXChunkYChunkQueried &&
    6130          39 :                 !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
    6131          39 :                 (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
    6132             :             {
    6133             :                 // Create a VRT with the smaller chunk to do the scaling
    6134             :                 auto poVRTDS =
    6135          13 :                     CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
    6136             : 
    6137          13 :                 std::vector<GDALRasterBand *> apoVRTBand(nBands);
    6138          13 :                 std::vector<GDALRasterBand *> apoDstBand(nBands);
    6139       65560 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    6140             :                 {
    6141       65547 :                     apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
    6142       65547 :                     apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
    6143             :                 }
    6144             : 
    6145             :                 // Use a flag to avoid reading from the overview being built
    6146             :                 GDALRasterIOExtraArg sExtraArg;
    6147          13 :                 INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    6148          13 :                 if (iSrcOverview == -1)
    6149          13 :                     sExtraArg.bUseOnlyThisScale = true;
    6150             : 
    6151             :                 // A single band buffer for data transfer to the overview
    6152          13 :                 std::vector<GByte> abyChunk;
    6153             :                 try
    6154             :                 {
    6155          13 :                     abyChunk.resize(nChunkSize);
    6156             :                 }
    6157           0 :                 catch (const std::exception &)
    6158             :                 {
    6159           0 :                     CPLError(CE_Failure, CPLE_OutOfMemory,
    6160             :                              "Out of memory allocating temporary buffer");
    6161           0 :                     return CE_Failure;
    6162             :                 }
    6163             : 
    6164             :                 // Loop over output height, in chunks
    6165          13 :                 for (int nDstYOff = nDstYOffStart;
    6166          38 :                      nDstYOff < nDstYOffEnd && eErr == CE_None;
    6167             :                      /* */)
    6168             :                 {
    6169             :                     const int nDstYCount =
    6170          25 :                         std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
    6171             :                     // Loop over output width, in output chunks
    6172          25 :                     for (int nDstXOff = nDstXOffStart;
    6173          74 :                          nDstXOff < nDstXOffEnd && eErr == CE_None;
    6174             :                          /* */)
    6175             :                     {
    6176             :                         const int nDstXCount =
    6177          49 :                             std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
    6178             :                         // Read and transfer the chunk to the overview
    6179          98 :                         for (int iBand = 0; iBand < nBands && eErr == CE_None;
    6180             :                              ++iBand)
    6181             :                         {
    6182          98 :                             eErr = apoVRTBand[iBand]->RasterIO(
    6183             :                                 GF_Read, nDstXOff, nDstYOff, nDstXCount,
    6184          49 :                                 nDstYCount, abyChunk.data(), nDstXCount,
    6185             :                                 nDstYCount, eDataType, 0, 0, &sExtraArg);
    6186          49 :                             if (eErr == CE_None)
    6187             :                             {
    6188          96 :                                 eErr = apoDstBand[iBand]->RasterIO(
    6189             :                                     GF_Write, nDstXOff, nDstYOff, nDstXCount,
    6190          48 :                                     nDstYCount, abyChunk.data(), nDstXCount,
    6191             :                                     nDstYCount, eDataType, 0, 0, nullptr);
    6192             :                             }
    6193             :                         }
    6194             : 
    6195          49 :                         dfCurPixelCount +=
    6196          49 :                             static_cast<double>(nDstXCount) * nDstYCount;
    6197             : 
    6198          49 :                         nDstXOff += nDstXCount;
    6199             :                     }  // width
    6200             : 
    6201          25 :                     if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
    6202             :                                      nullptr, pProgressData))
    6203             :                     {
    6204           0 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    6205             :                                  "User terminated");
    6206           0 :                         eErr = CE_Failure;
    6207             :                     }
    6208             : 
    6209          25 :                     nDstYOff += nDstYCount;
    6210             :                 }  // height
    6211             : 
    6212          13 :                 if (CE_None != eErr)
    6213             :                 {
    6214           1 :                     CPLError(CE_Failure, CPLE_AppDefined,
    6215             :                              "Error while writing overview");
    6216           1 :                     return CE_Failure;
    6217             :                 }
    6218             : 
    6219          12 :                 pfnProgress(1.0, nullptr, pProgressData);
    6220             :                 // Flush the overviews we just generated
    6221          24 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    6222          12 :                     apoDstBand[iBand]->FlushCache(false);
    6223             : 
    6224          12 :                 continue;  // Next overview
    6225             :             }  // chunking via temporary dataset
    6226             : 
    6227           0 :             std::unique_ptr<GDALDataset> poTmpDS;
    6228             :             // Config option mostly/only for autotest purposes
    6229             :             const char *pszGDAL_OVR_TEMP_DRIVER =
    6230          30 :                 CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
    6231          30 :             if ((!bTmpDSMemRequirementOverflow &&
    6232           4 :                  nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
    6233           4 :                  !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
    6234          26 :                 EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
    6235             :             {
    6236          10 :                 auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
    6237          10 :                 if (!poTmpDrv)
    6238             :                 {
    6239           0 :                     eErr = CE_Failure;
    6240           0 :                     break;
    6241             :                 }
    6242          10 :                 poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
    6243             :                                                nDstTotalHeight, nBands,
    6244          10 :                                                eDataType, nullptr));
    6245             :             }
    6246             :             else
    6247             :             {
    6248             :                 // Create a temporary file for the overview
    6249             :                 auto poTmpDrv =
    6250          20 :                     GetGDALDriverManager()->GetDriverByName("GTiff");
    6251          20 :                 if (!poTmpDrv)
    6252             :                 {
    6253           0 :                     eErr = CE_Failure;
    6254           0 :                     break;
    6255             :                 }
    6256          40 :                 std::string osTmpFilename;
    6257          20 :                 auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
    6258          20 :                 if (poDstDS)
    6259             :                 {
    6260          20 :                     osTmpFilename = poDstDS->GetDescription();
    6261             :                     VSIStatBufL sStatBuf;
    6262          20 :                     if (!osTmpFilename.empty() &&
    6263           0 :                         VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
    6264           0 :                         osTmpFilename += "_tmp_ovr.tif";
    6265             :                 }
    6266          20 :                 if (osTmpFilename.empty())
    6267             :                 {
    6268          20 :                     osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
    6269          20 :                     osTmpFilename += ".tif";
    6270             :                 }
    6271          20 :                 CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
    6272             :                          osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
    6273          40 :                 CPLStringList aosCO;
    6274          20 :                 if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
    6275          20 :                           (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
    6276             :                 {
    6277          14 :                     aosCO.SetNameValue("TILED", "YES");
    6278             :                     aosCO.SetNameValue("BLOCKXSIZE",
    6279          14 :                                        CPLSPrintf("%d", nReducedDstChunkXSize));
    6280             :                     aosCO.SetNameValue("BLOCKYSIZE",
    6281          14 :                                        CPLSPrintf("%d", nReducedDstChunkYSize));
    6282             :                 }
    6283          20 :                 if (const char *pszCOList =
    6284          20 :                         poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
    6285             :                 {
    6286             :                     aosCO.SetNameValue(
    6287          20 :                         "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
    6288             :                 }
    6289          20 :                 poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
    6290             :                                                nDstHeight, nBands, eDataType,
    6291          20 :                                                aosCO.List()));
    6292          20 :                 if (poTmpDS)
    6293             :                 {
    6294          18 :                     poTmpDS->MarkSuppressOnClose();
    6295          18 :                     VSIUnlink(osTmpFilename.c_str());
    6296             :                 }
    6297             :             }
    6298          30 :             if (!poTmpDS)
    6299             :             {
    6300           2 :                 eErr = CE_Failure;
    6301           2 :                 break;
    6302             :             }
    6303             : 
    6304             :             // Create a full size VRT to do the resampling without edge effects
    6305             :             auto poVRTDS =
    6306          28 :                 CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
    6307             : 
    6308             :             // Allocate a band buffer with the overview chunk size
    6309             :             std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
    6310             :                 VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
    6311          28 :                                     nDstChunkYSize));
    6312          28 :             if (pDstBuffer == nullptr)
    6313             :             {
    6314           0 :                 eErr = CE_Failure;
    6315           0 :                 break;
    6316             :             }
    6317             : 
    6318             :             // Use a flag to avoid reading the overview being built
    6319             :             GDALRasterIOExtraArg sExtraArg;
    6320          28 :             INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    6321          28 :             if (iSrcOverview == -1)
    6322           4 :                 sExtraArg.bUseOnlyThisScale = true;
    6323             : 
    6324             :             // Scale and copy data from the VRT to the temp file
    6325          28 :             for (int nDstYOff = nDstYOffStart;
    6326         914 :                  nDstYOff < nDstYOffEnd && eErr == CE_None;
    6327             :                  /* */)
    6328             :             {
    6329             :                 const int nDstYCount =
    6330         886 :                     std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
    6331         886 :                 for (int nDstXOff = nDstXOffStart;
    6332      201218 :                      nDstXOff < nDstXOffEnd && eErr == CE_None;
    6333             :                      /* */)
    6334             :                 {
    6335             :                     const int nDstXCount =
    6336      200332 :                         std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
    6337      400668 :                     for (int iBand = 0; iBand < nBands && eErr == CE_None;
    6338             :                          ++iBand)
    6339             :                     {
    6340      200336 :                         auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
    6341      200336 :                         eErr = poSrcBand->RasterIO(
    6342             :                             GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
    6343             :                             pDstBuffer.get(), nDstXCount, nDstYCount,
    6344             :                             eWrkDataType, 0, 0, &sExtraArg);
    6345      200336 :                         if (eErr == CE_None)
    6346             :                         {
    6347             :                             // Write to the temporary dataset, shifted
    6348      200334 :                             auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
    6349      200334 :                             eErr = poOvrBand->RasterIO(
    6350             :                                 GF_Write, nDstXOff - nDstXOffStart,
    6351             :                                 nDstYOff - nDstYOffStart, nDstXCount,
    6352             :                                 nDstYCount, pDstBuffer.get(), nDstXCount,
    6353             :                                 nDstYCount, eWrkDataType, 0, 0, nullptr);
    6354             :                         }
    6355             :                     }
    6356      200332 :                     nDstXOff += nDstXCount;
    6357             :                 }
    6358         886 :                 nDstYOff += nDstYCount;
    6359             :             }
    6360             : 
    6361             :             // Copy from the temporary to the overview
    6362          28 :             for (int nDstYOff = nDstYOffStart;
    6363          54 :                  nDstYOff < nDstYOffEnd && eErr == CE_None;
    6364             :                  /* */)
    6365             :             {
    6366             :                 const int nDstYCount =
    6367          26 :                     std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
    6368          26 :                 for (int nDstXOff = nDstXOffStart;
    6369          52 :                      nDstXOff < nDstXOffEnd && eErr == CE_None;
    6370             :                      /* */)
    6371             :                 {
    6372             :                     const int nDstXCount =
    6373          26 :                         std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
    6374          56 :                     for (int iBand = 0; iBand < nBands && eErr == CE_None;
    6375             :                          ++iBand)
    6376             :                     {
    6377          30 :                         auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
    6378          30 :                         eErr = poSrcBand->RasterIO(
    6379             :                             GF_Read, nDstXOff - nDstXOffStart,
    6380             :                             nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
    6381             :                             pDstBuffer.get(), nDstXCount, nDstYCount,
    6382             :                             eWrkDataType, 0, 0, nullptr);
    6383          30 :                         if (eErr == CE_None)
    6384             :                         {
    6385             :                             // Write to the destination overview bands
    6386          30 :                             auto poOvrBand =
    6387          30 :                                 papapoOverviewBands[iBand][iOverview];
    6388          30 :                             eErr = poOvrBand->RasterIO(
    6389             :                                 GF_Write, nDstXOff, nDstYOff, nDstXCount,
    6390             :                                 nDstYCount, pDstBuffer.get(), nDstXCount,
    6391             :                                 nDstYCount, eWrkDataType, 0, 0, nullptr);
    6392             :                         }
    6393             :                     }
    6394          26 :                     nDstXOff += nDstXCount;
    6395             :                 }
    6396          26 :                 nDstYOff += nDstYCount;
    6397             :             }
    6398             : 
    6399          28 :             if (eErr != CE_None)
    6400             :             {
    6401           2 :                 CPLError(CE_Failure, CPLE_AppDefined,
    6402             :                          "Failed to write overview %d", iOverview);
    6403           2 :                 return eErr;
    6404             :             }
    6405             : 
    6406             :             // Flush the data to overviews.
    6407          56 :             for (int iBand = 0; iBand < nBands; ++iBand)
    6408          30 :                 papapoOverviewBands[iBand][iOverview]->FlushCache(false);
    6409             : 
    6410          26 :             continue;
    6411             :         }
    6412             : 
    6413             :         // Structure describing a resampling job: the inputs handed to pfnResampleFn, the outputs it produces, and a completion flag guarded by a mutex/condition variable
    6414             :         struct OvrJob
    6415             :         {
    6416             :             // Buffers to free when job is finished
    6417             :             std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
    6418             :             std::unique_ptr<PointerHolder> oSrcBufferHolder{};
    6419             :             std::unique_ptr<PointerHolder> oDstBufferHolder{};
    6420             : 
    6421             :             GDALRasterBand *poDstBand = nullptr;  // band that receives pDstBuffer once the job is written out
    6422             : 
    6423             :             // Input parameters of pfnResampleFn
    6424             :             GDALResampleFunction pfnResampleFn = nullptr;
    6425             :             GDALOverviewResampleArgs args{};
    6426             :             const void *pChunk = nullptr;
    6427             : 
    6428             :             // Output values of resampling function
    6429             :             CPLErr eErr = CE_Failure;  // stays CE_Failure until the resampling result is stored
    6430             :             void *pDstBuffer = nullptr;
    6431             :             GDALDataType eDstBufferDataType = GDT_Unknown;
    6432             :             // Marks the job as finished and wakes one thread waiting in WaitFinished()
    6433        3296 :             void NotifyFinished()
    6434             :             {
    6435        6592 :                 std::lock_guard guard(mutex);
    6436        3296 :                 bFinished = true;
    6437        3296 :                 cv.notify_one();
    6438        3296 :             }
    6439             :             // Returns the completion flag, read with the mutex held
    6440           2 :             bool IsFinished()
    6441             :             {
    6442           2 :                 std::lock_guard guard(mutex);
    6443           4 :                 return bFinished;
    6444             :             }
    6445             :             // Blocks until NotifyFinished() has run; the loop re-checks the flag after every wake-up, so spurious wake-ups are harmless
    6446          14 :             void WaitFinished()
    6447             :             {
    6448          28 :                 std::unique_lock oGuard(mutex);
    6449          21 :                 while (!bFinished)
    6450             :                 {
    6451           7 :                     cv.wait(oGuard);
    6452             :                 }
    6453          14 :             }
    6454             : 
    6455             :           private:
    6456             :             // Synchronization: bFinished is only read or written with mutex held
    6457             :             bool bFinished = false;
    6458             :             std::mutex mutex{};
    6459             :             std::condition_variable cv{};
    6460             :         };
    6461             : 
    6462             :         // Thread function to resample: pData must point to an OvrJob; runs its pfnResampleFn, stores the outputs in the job, then signals completion
    6463        3296 :         const auto JobResampleFunc = [](void *pData)
    6464             :         {
    6465        3296 :             OvrJob *poJob = static_cast<OvrJob *>(pData);
    6466             : 
    6467        3296 :             poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
    6468             :                                                &(poJob->pDstBuffer),
    6469             :                                                &(poJob->eDstBufferDataType));
    6470             :             // Wrap the output buffer in a PointerHolder stored in the job (presumably so it is released together with the job - TODO confirm)
    6471        3296 :             auto pDstBuffer = poJob->pDstBuffer;
    6472             :             poJob->oDstBufferHolder =
    6473        3296 :                 std::make_unique<PointerHolder>(pDstBuffer);
    6474             :             // Signal last, after all outputs above have been stored in the job
    6475        3296 :             poJob->NotifyFinished();
    6476        3296 :         };
    6477             : 
    6478             :         // Function to write resample data to target band: writes pDstBuffer to poDstBand over [nDstXOff, nDstXOff2) x [nDstYOff, nDstYOff2) and returns the RasterIO() result
    6479        3296 :         const auto WriteJobData = [](const OvrJob *poJob)
    6480             :         {
    6481        6592 :             return poJob->poDstBand->RasterIO(
    6482        3296 :                 GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
    6483        3296 :                 poJob->args.nDstXOff2 - poJob->args.nDstXOff,
    6484        3296 :                 poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
    6485        3296 :                 poJob->args.nDstXOff2 - poJob->args.nDstXOff,  // buffer size equals window size
    6486        3296 :                 poJob->args.nDstYOff2 - poJob->args.nDstYOff,
    6487        3296 :                 poJob->eDstBufferDataType, 0, 0, nullptr);
    6488             :         };
    6489             : 
    6490             :         // Wait for completion of oldest job and serialize it: blocks on jobList.front(), writes its output if resampling succeeded, pops it and returns the resulting CPLErr. jobList must not be empty.
    6491             :         const auto WaitAndFinalizeOldestJob =
    6492          14 :             [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    6493             :         {
    6494          14 :             auto poOldestJob = jobList.front().get();
    6495          14 :             poOldestJob->WaitFinished();
    6496          14 :             CPLErr l_eErr = poOldestJob->eErr;
    6497          14 :             if (l_eErr == CE_None)
    6498             :             {
    6499          14 :                 l_eErr = WriteJobData(poOldestJob);
    6500             :             }
    6501             :             // Popping destroys the job's unique_ptr, and with it the job and its buffer holders; poOldestJob must not be used afterwards
    6502          14 :             jobList.pop_front();
    6503          14 :             return l_eErr;
    6504             :         };
    6505             : 
    6506             :         // Queue of jobs
    6507        1198 :         std::list<std::unique_ptr<OvrJob>> jobList;
    6508             : 
    6509        1198 :         std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
    6510             :         std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
    6511        1198 :             apabyChunkNoDataMask(nBands);
    6512             : 
    6513             :         // Iterate on destination overview, block by block.
    6514         599 :         for (int nDstYOff = nDstYOffStart;
    6515        2105 :              nDstYOff < nDstYOffEnd && eErr == CE_None;
    6516        1506 :              nDstYOff += nDstChunkYSize)
    6517             :         {
    6518             :             int nDstYCount;
    6519        1506 :             if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
    6520        1085 :                 nDstYCount = nDstChunkYSize;
    6521             :             else
    6522         421 :                 nDstYCount = nDstYOffEnd - nDstYOff;
    6523             : 
    6524        1506 :             int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    6525        1506 :             int nChunkYOff2 = static_cast<int>(
    6526        1506 :                 ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
    6527        1506 :             if (nChunkYOff2 > nSrcHeight ||
    6528        1506 :                 nDstYOff + nDstYCount == nDstTotalHeight)
    6529         592 :                 nChunkYOff2 = nSrcHeight;
    6530        1506 :             int nYCount = nChunkYOff2 - nChunkYOff;
    6531        1506 :             CPLAssert(nYCount <= nFullResYChunk);
    6532             : 
    6533        1506 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    6534        1506 :             int nChunkYSizeQueried =
    6535        1506 :                 nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
    6536        1506 :             if (nChunkYOffQueried < 0)
    6537             :             {
    6538         146 :                 nChunkYSizeQueried += nChunkYOffQueried;
    6539         146 :                 nChunkYOffQueried = 0;
    6540             :             }
    6541        1506 :             if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
    6542         146 :                 nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
    6543        1506 :             CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
    6544             : 
    6545        1506 :             if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
    6546             :                              nullptr, pProgressData))
    6547             :             {
    6548           1 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6549           1 :                 eErr = CE_Failure;
    6550             :             }
    6551             : 
    6552             :             // Iterate on destination overview, block by block.
    6553        1506 :             for (int nDstXOff = nDstXOffStart;
    6554        3053 :                  nDstXOff < nDstXOffEnd && eErr == CE_None;
    6555        1547 :                  nDstXOff += nDstChunkXSize)
    6556             :             {
    6557        1547 :                 int nDstXCount = 0;
    6558        1547 :                 if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
    6559        1528 :                     nDstXCount = nDstChunkXSize;
    6560             :                 else
    6561          19 :                     nDstXCount = nDstXOffEnd - nDstXOff;
    6562             : 
    6563        1547 :                 dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
    6564             : 
    6565        1547 :                 int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    6566        1547 :                 int nChunkXOff2 = static_cast<int>(
    6567        1547 :                     ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    6568        1547 :                 if (nChunkXOff2 > nSrcWidth ||
    6569        1547 :                     nDstXOff + nDstXCount == nDstTotalWidth)
    6570        1470 :                     nChunkXOff2 = nSrcWidth;
    6571        1547 :                 const int nXCount = nChunkXOff2 - nChunkXOff;
    6572        1547 :                 CPLAssert(nXCount <= nFullResXChunk);
    6573             : 
    6574        1547 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    6575        1547 :                 int nChunkXSizeQueried =
    6576        1547 :                     nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
    6577        1547 :                 if (nChunkXOffQueried < 0)
    6578             :                 {
    6579         209 :                     nChunkXSizeQueried += nChunkXOffQueried;
    6580         209 :                     nChunkXOffQueried = 0;
    6581             :                 }
    6582        1547 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
    6583         218 :                     nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
    6584        1547 :                 CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
    6585             : #if DEBUG_VERBOSE
    6586             :                 CPLDebug("GDAL",
    6587             :                          "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
    6588             :                          nChunkXOffQueried, nChunkYOffQueried,
    6589             :                          nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
    6590             :                          nDstYOff, nDstXCount, nDstYCount);
    6591             : #endif
    6592             : 
    6593             :                 // Avoid accumulating too many tasks and exhaust RAM
    6594             : 
    6595             :                 // Try to complete already finished jobs
    6596        1549 :                 while (eErr == CE_None && !jobList.empty())
    6597             :                 {
    6598           2 :                     auto poOldestJob = jobList.front().get();
    6599           2 :                     if (!poOldestJob->IsFinished())
    6600           0 :                         break;
    6601           2 :                     eErr = poOldestJob->eErr;
    6602           2 :                     if (eErr == CE_None)
    6603             :                     {
    6604           2 :                         eErr = WriteJobData(poOldestJob);
    6605             :                     }
    6606             : 
    6607           2 :                     jobList.pop_front();
    6608             :                 }
    6609             : 
    6610             :                 // And in case we have saturated the number of threads,
    6611             :                 // wait for completion of tasks to go below the threshold.
    6612        3094 :                 while (eErr == CE_None &&
    6613        1547 :                        jobList.size() >= static_cast<size_t>(nThreads))
    6614             :                 {
    6615           0 :                     eErr = WaitAndFinalizeOldestJob(jobList);
    6616             :                 }
    6617             : 
    6618             :                 // Read the source buffers for all the bands.
    6619        4844 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    6620             :                 {
    6621             :                     // (Re)allocate buffers if needed
    6622        3297 :                     if (apaChunk[iBand] == nullptr)
    6623             :                     {
    6624        1171 :                         apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
    6625             :                             nFullResXChunkQueried, nFullResYChunkQueried,
    6626             :                             nWrkDataTypeSize));
    6627        1171 :                         if (apaChunk[iBand] == nullptr)
    6628             :                         {
    6629           0 :                             eErr = CE_Failure;
    6630             :                         }
    6631             :                     }
    6632        3632 :                     if (bUseNoDataMask &&
    6633         335 :                         apabyChunkNoDataMask[iBand] == nullptr)
    6634             :                     {
    6635         268 :                         apabyChunkNoDataMask[iBand].reset(
    6636         268 :                             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    6637             :                                 nFullResXChunkQueried, nFullResYChunkQueried)));
    6638         268 :                         if (apabyChunkNoDataMask[iBand] == nullptr)
    6639             :                         {
    6640           0 :                             eErr = CE_Failure;
    6641             :                         }
    6642             :                     }
    6643             : 
    6644        3297 :                     if (eErr == CE_None)
    6645             :                     {
    6646        3297 :                         GDALRasterBand *poSrcBand = nullptr;
    6647        3297 :                         if (iSrcOverview == -1)
    6648        2405 :                             poSrcBand = papoSrcBands[iBand];
    6649             :                         else
    6650         892 :                             poSrcBand =
    6651         892 :                                 papapoOverviewBands[iBand][iSrcOverview];
    6652        3297 :                         eErr = poSrcBand->RasterIO(
    6653             :                             GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    6654             :                             nChunkXSizeQueried, nChunkYSizeQueried,
    6655        3297 :                             apaChunk[iBand].get(), nChunkXSizeQueried,
    6656             :                             nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
    6657             : 
    6658        3297 :                         if (bUseNoDataMask && eErr == CE_None)
    6659             :                         {
    6660         335 :                             auto poMaskBand = poSrcBand->IsMaskBand()
    6661         335 :                                                   ? poSrcBand
    6662         253 :                                                   : poSrcBand->GetMaskBand();
    6663         335 :                             eErr = poMaskBand->RasterIO(
    6664             :                                 GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    6665             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    6666         335 :                                 apabyChunkNoDataMask[iBand].get(),
    6667             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    6668             :                                 GDT_UInt8, 0, 0, nullptr);
    6669             :                         }
    6670             :                     }
    6671             :                 }
    6672             : 
    6673             :                 // Compute the resulting overview block.
    6674        4843 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    6675             :                 {
    6676        6592 :                     auto poJob = std::make_unique<OvrJob>();
    6677        3296 :                     poJob->pfnResampleFn = pfnResampleFn;
    6678        3296 :                     poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
    6679        6592 :                     poJob->args.eOvrDataType =
    6680        3296 :                         poJob->poDstBand->GetRasterDataType();
    6681        3296 :                     poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
    6682        3296 :                     poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
    6683        3296 :                     const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
    6684        3296 :                         GDALMD_NBITS, GDAL_MDD_IMAGE_STRUCTURE);
    6685        3296 :                     poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    6686        3296 :                     poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    6687        3296 :                     poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    6688        3296 :                     poJob->args.eWrkDataType = eWrkDataType;
    6689        3296 :                     poJob->pChunk = apaChunk[iBand].get();
    6690        3296 :                     poJob->args.pabyChunkNodataMask =
    6691        3296 :                         apabyChunkNoDataMask[iBand].get();
    6692        3296 :                     poJob->args.nChunkXOff = nChunkXOffQueried;
    6693        3296 :                     poJob->args.nChunkXSize = nChunkXSizeQueried;
    6694        3296 :                     poJob->args.nChunkYOff = nChunkYOffQueried;
    6695        3296 :                     poJob->args.nChunkYSize = nChunkYSizeQueried;
    6696        3296 :                     poJob->args.nDstXOff = nDstXOff;
    6697        3296 :                     poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
    6698        3296 :                     poJob->args.nDstYOff = nDstYOff;
    6699        3296 :                     poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
    6700        3296 :                     poJob->args.pszResampling = pszResampling;
    6701        3296 :                     poJob->args.bHasNoData = abHasNoData[iBand];
    6702        3296 :                     poJob->args.dfNoDataValue = adfNoDataValue[iBand];
    6703        3296 :                     poJob->args.eSrcDataType = eDataType;
    6704        3296 :                     poJob->args.bPropagateNoData = bPropagateNoData;
    6705             : 
    6706        3296 :                     if (poJobQueue)
    6707             :                     {
    6708          16 :                         poJob->oSrcMaskBufferHolder =
    6709          32 :                             std::make_unique<PointerHolder>(
    6710          32 :                                 std::move(apabyChunkNoDataMask[iBand]));
    6711             : 
    6712          16 :                         poJob->oSrcBufferHolder =
    6713          32 :                             std::make_unique<PointerHolder>(
    6714          32 :                                 std::move(apaChunk[iBand]));
    6715             : 
    6716          16 :                         poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    6717          16 :                         jobList.emplace_back(std::move(poJob));
    6718             :                     }
    6719             :                     else
    6720             :                     {
    6721        3280 :                         JobResampleFunc(poJob.get());
    6722        3280 :                         eErr = poJob->eErr;
    6723        3280 :                         if (eErr == CE_None)
    6724             :                         {
    6725        3280 :                             eErr = WriteJobData(poJob.get());
    6726             :                         }
    6727             :                     }
    6728             :                 }
    6729             :             }
    6730             :         }
    6731             : 
    6732             :         // Wait for all pending jobs to complete
    6733         613 :         while (!jobList.empty())
    6734             :         {
    6735          14 :             const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    6736          14 :             if (l_eErr != CE_None && eErr == CE_None)
    6737           0 :                 eErr = l_eErr;
    6738             :         }
    6739             : 
    6740             :         // Flush the data to overviews.
    6741        1768 :         for (int iBand = 0; iBand < nBands; ++iBand)
    6742             :         {
    6743        1169 :             if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
    6744             :                 CE_None)
    6745           0 :                 eErr = CE_Failure;
    6746             :         }
    6747             :     }
    6748             : 
    6749         384 :     if (eErr == CE_None)
    6750         380 :         pfnProgress(1.0, nullptr, pProgressData);
    6751             : 
    6752         384 :     return eErr;
    6753             : }
    6754             : 
    6755             : /************************************************************************/
    6756             : /*                  GDALRegenerateOverviewsMultiBand()                  */
    6757             : /************************************************************************/
    6758             : 
    6759             : /**
    6760             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    6761             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
    6762             :  *
    6763             :  * This function will generate one or more overview images from a base
    6764             :  * image using the requested downsampling algorithm.  Its primary use
    6765             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    6766             :  * can also be used to generate downsampled images in one file from another
    6767             :  * outside the overview architecture.
    6768             :  *
    6769             :  * The output bands need to exist in advance and share the same characteristics
    6770             :  * (type, dimensions)
    6771             :  *
    6772             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    6773             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
    6774             :  *
    6775             :  * It does not support color tables or complex data types.
    6776             :  *
    6777             :  * The pseudo-algorithm used by the function is :
    6778             :  *    for each overview
    6779             :  *       iterate on lines of the source by a step of deltay
    6780             :  *           iterate on columns of the source  by a step of deltax
    6781             :  *               read the source data of size deltax * deltay for all the bands
    6782             :  *               generate the corresponding overview block for all the bands
    6783             :  *
    6784             :  * This function will properly honour NODATA_VALUES tuples (special dataset
    6785             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    6786             :  * considered as the nodata value and not each value of the triplet
    6787             :  * independently per band.
    6788             :  *
    6789             :  * The GDAL_NUM_THREADS configuration option can be set
    6790             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    6791             :  * overview computation.
    6792             :  *
    6793             :  * @param apoSrcBands the list of source bands to downsample
    6794             :  * @param aapoOverviewBands two-dimensional array of bands. First dimension is
    6795             :  *                          indexed by bands. Second dimension is indexed by
    6796             :  *                          overview levels. All aapoOverviewBands[i] arrays
    6797             :  *                          must have the same size (i.e. same number of
    6798             :  *                          overviews)
    6799             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    6800             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
    6801             :  * @param pfnProgress progress report function.
    6802             :  * @param pProgressData progress function callback data.
    6803             :  * @param papszOptions NULL terminated list of options as
    6804             :  *                     key=value pairs, or NULL
    6805             :  *                     The XOFF, YOFF, XSIZE and YSIZE
    6806             :  *                     options can be specified to express that overviews should
    6807             :  *                     be regenerated only in the specified subset of the source
    6808             :  *                     dataset.
    6809             :  * @return CE_None on success or CE_Failure on failure.
    6810             :  * @since 3.10
    6811             :  */
    6812             : 
    6813          19 : CPLErr GDALRegenerateOverviewsMultiBand(
    6814             :     const std::vector<GDALRasterBand *> &apoSrcBands,
    6815             :     const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
    6816             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    6817             :     void *pProgressData, CSLConstList papszOptions)
    6818             : {
                           // Convenience overload taking std::vector arguments: each per-band
                           // list of overview bands is copied into a temporary C array, the
                           // whole set is forwarded to the overload that takes band/overview
                           // counts plus raw pointer arrays, and the temporaries are released
                           // before returning that overload's status.

                           // Preconditions: one overview list per source band, and all
                           // overview lists of the same length.
                           // NOTE(review): these are checked with CPLAssert only. The call
                           // below takes the band count from apoSrcBands and the overview
                           // count from aapoOverviewBands[0], while the temporary arrays are
                           // sized from aapoOverviewBands, so inconsistent inputs would not be
                           // rejected here if the assertions are compiled out -- confirm
                           // whether a runtime check is wanted.
    6819          19 :     CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
    6820          29 :     for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
    6821             :     {
    6822          10 :         CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
    6823             :     }
    6824             : 
                           // No overview list at all: report success without doing anything.
                           // This also protects the aapoOverviewBands[0] access further down.
    6825          19 :     if (aapoOverviewBands.empty())
    6826           0 :         return CE_None;
    6827             : 
                           // Build one CPLMalloc'ed array of band pointers per entry of
                           // aapoOverviewBands; apapoOverviewBands holds those arrays.
    6828          19 :     std::vector<GDALRasterBand **> apapoOverviewBands;
    6829          48 :     for (auto &apoOverviewBands : aapoOverviewBands)
    6830             :     {
    6831             :         auto papoOverviewBands = static_cast<GDALRasterBand **>(
    6832          29 :             CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
    6833          61 :         for (size_t i = 0; i < apoOverviewBands.size(); ++i)
    6834             :         {
    6835          32 :             papoOverviewBands[i] = apoOverviewBands[i];
    6836             :         }
    6837          29 :         apapoOverviewBands.push_back(papoOverviewBands);
    6838             :     }
                           // Forward to the pointer-based overload; resampling name, progress
                           // callback and options are passed through unchanged.
    6839          38 :     const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
    6840          19 :         static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
    6841          19 :         static_cast<int>(aapoOverviewBands[0].size()),
    6842          19 :         apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
    6843             :         papszOptions);
                           // Release the temporary arrays whatever the outcome of the call.
    6844          48 :     for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
    6845          29 :         CPLFree(papoOverviewBands);
    6846          19 :     return eErr;
    6847             : }
    6848             : 
    6849             : /************************************************************************/
    6850             : /*                        GDALComputeBandStats()                        */
    6851             : /************************************************************************/
    6852             : 
    6853             : /** Compute the mean and standard deviation of a band from sampled scanlines.
    6854             :  * @param hSrcBand band to analyse; checked with VALIDATE_POINTER1 (CE_Failure).
    6855             :  * @param nSampleStep Step between scanlines used to compute statistics.
    6856             :  *                    When nSampleStep is equal to 1, all scanlines will
    6857             :  *                    be processed. Values < 1 or >= band height act as 1.
    6858             :  * @param pdfMean output mean of the sampled pixels, or NULL if not wanted.
    6859             :  * @param pdfStdDev output standard deviation, or NULL if not wanted.
    6860             :  * @param pfnProgress progress callback, or NULL to use GDALDummyProgress.
    6861             :  * @param pProgressData user data passed unchanged to pfnProgress.
    6862             :  * @return CE_None on success, CE_Failure or the RasterIO() error otherwise.
    6863             :  */
    6864          18 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
    6865             :                                         int nSampleStep, double *pdfMean,
    6866             :                                         double *pdfStdDev,
    6867             :                                         GDALProgressFunc pfnProgress,
    6868             :                                         void *pProgressData)
    6869             : 
    6870             : {
    6871          18 :     VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
    6872             : 
    6873          18 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    6874             : 
    6875          18 :     if (pfnProgress == nullptr)
    6876          18 :         pfnProgress = GDALDummyProgress;
    6877             : 
    6878          18 :     const int nWidth = poSrcBand->GetXSize();
    6879          18 :     const int nHeight = poSrcBand->GetYSize();
    6880             : 
                           // An out-of-range step (< 1 or >= band height) falls back to
                           // reading every scanline.
    6881          18 :     if (nSampleStep >= nHeight || nSampleStep < 1)
    6882           5 :         nSampleStep = 1;
    6883             : 
                           // Single-scanline work buffer. Complex bands are read as
                           // GDT_CFloat32 (two floats per pixel), everything else as
                           // GDT_Float32 (one float per pixel).
    6884          18 :     GDALDataType eWrkType = GDT_Unknown;
    6885          18 :     float *pafData = nullptr;
    6886          18 :     GDALDataType eType = poSrcBand->GetRasterDataType();
    6887          18 :     const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    6888          18 :     if (bComplex)
    6889             :     {
    6890             :         pafData = static_cast<float *>(
    6891           0 :             VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
    6892           0 :         eWrkType = GDT_CFloat32;
    6893             :     }
    6894             :     else
    6895             :     {
    6896             :         pafData =
    6897          18 :             static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
    6898          18 :         eWrkType = GDT_Float32;
    6899             :     }
    6900             : 
                           // Zero-width band or failed allocation: nothing can be computed.
                           // NOTE(review): the buffer comes from VSI_MALLOC2_VERBOSE and is
                           // released with VSIFree() here but with CPLFree() on every later
                           // path -- presumably the two are equivalent; confirm.
    6901          18 :     if (nWidth == 0 || pafData == nullptr)
    6902             :     {
    6903           0 :         VSIFree(pafData);
    6904           0 :         return CE_Failure;
    6905             :     }
    6906             : 
    6907             :     /* -------------------------------------------------------------------- */
    6908             :     /*      Loop over all sample lines.                                     */
    6909             :     /* -------------------------------------------------------------------- */
                           // dfSum / dfSum2 accumulate the values and their squares in double
                           // precision; nSamples counts pixels (nWidth per line read).
    6910          18 :     double dfSum = 0.0;
    6911          18 :     double dfSum2 = 0.0;
    6912          18 :     int iLine = 0;
    6913          18 :     GIntBig nSamples = 0;
    6914             : 
                           // do/while: line 0 is always read, then every nSampleStep-th line
                           // while iLine < nHeight.
                           // NOTE(review): the body executes once, and the progress ratio
                           // divides by nHeight, even if nHeight were 0 -- presumably bands
                           // always have at least one line; confirm.
    6915        2143 :     do
    6916             :     {
                               // Report progress before each line; a false return cancels.
    6917        2161 :         if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    6918             :                          pProgressData))
    6919             :         {
    6920           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6921           0 :             CPLFree(pafData);
    6922           0 :             return CE_Failure;
    6923             :         }
    6924             : 
                               // Read one full scanline, converted to the work type.
    6925             :         const CPLErr eErr =
    6926        2161 :             poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
    6927             :                                 1, eWrkType, 0, 0, nullptr);
    6928        2161 :         if (eErr != CE_None)
    6929             :         {
    6930           1 :             CPLFree(pafData);
    6931           1 :             return eErr;
    6932             :         }
    6933             : 
                               // Every pixel of the line is accumulated; no nodata or mask
                               // test is applied here.
    6934      725208 :         for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    6935             :         {
    6936      723048 :             float fValue = 0.0f;
    6937             : 
    6938      723048 :             if (bComplex)
    6939             :             {
    6940             :                 // Compute the magnitude of the complex value.
    6941             :                 fValue =
    6942           0 :                     std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
    6943           0 :                                pafData[static_cast<size_t>(iPixel) * 2 + 1]);
    6944             :             }
    6945             :             else
    6946             :             {
    6947      723048 :                 fValue = pafData[iPixel];
    6948             :             }
    6949             : 
    6950      723048 :             dfSum += static_cast<double>(fValue);
    6951      723048 :             dfSum2 += static_cast<double>(fValue) * static_cast<double>(fValue);
    6952             :         }
    6953             : 
    6954        2160 :         nSamples += nWidth;
    6955        2160 :         iLine += nSampleStep;
    6956        2160 :     } while (iLine < nHeight);
    6957             : 
                           // Final progress notification; cancellation is still honoured here.
    6958          17 :     if (!pfnProgress(1.0, nullptr, pProgressData))
    6959             :     {
    6960           0 :         CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6961           0 :         CPLFree(pafData);
    6962           0 :         return CE_Failure;
    6963             :     }
    6964             : 
    6965             :     /* -------------------------------------------------------------------- */
    6966             :     /*      Produce the result values.                                      */
    6967             :     /* -------------------------------------------------------------------- */
                           // mean = sum / N ; stddev = sqrt(sum2 / N - mean^2), with N the
                           // number of sampled pixels. Each output is written only when its
                           // pointer is non-null.
    6968          17 :     if (pdfMean != nullptr)
    6969          17 :         *pdfMean = dfSum / nSamples;
    6970             : 
    6971          17 :     if (pdfStdDev != nullptr)
    6972             :     {
    6973          17 :         const double dfMean = dfSum / nSamples;
    6974             : 
                               // NOTE(review): floating-point rounding can make the sqrt()
                               // argument marginally negative (e.g. on a constant-valued
                               // band), which would produce NaN instead of 0 -- consider
                               // clamping the argument at 0.
    6975          17 :         *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
    6976             :     }
    6977             : 
    6978          17 :     CPLFree(pafData);
    6979             : 
    6980          17 :     return CE_None;
    6981             : }
    6982             : 
    6983             : /************************************************************************/
    6984             : /*                  GDALOverviewMagnitudeCorrection()                   */
    6985             : /*                                                                      */
    6986             : /*      Correct the mean and standard deviation of the overviews of     */
    6987             : /*      the given band to match the base layer approximately.           */
    6988             : /************************************************************************/
    6989             : 
    6990             : /** Undocumented
    6991             :  * @param hBaseBand undocumented.
    6992             :  * @param nOverviewCount undocumented.
    6993             :  * @param pahOverviews undocumented.
    6994             :  * @param pfnProgress undocumented.
    6995             :  * @param pProgressData undocumented.
    6996             :  * @return undocumented
    6997             :  */
    6998           0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
    6999             :                                        int nOverviewCount,
    7000             :                                        GDALRasterBandH *pahOverviews,
    7001             :                                        GDALProgressFunc pfnProgress,
    7002             :                                        void *pProgressData)
    7003             : 
    7004             : {
    7005           0 :     VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
    7006             : 
    7007             :     /* -------------------------------------------------------------------- */
    7008             :     /*      Compute mean/stddev for source raster.                          */
    7009             :     /* -------------------------------------------------------------------- */
    7010           0 :     double dfOrigMean = 0.0;
    7011           0 :     double dfOrigStdDev = 0.0;
    7012             :     {
    7013             :         const CPLErr eErr =
    7014           0 :             GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
    7015             :                                  pfnProgress, pProgressData);
    7016             : 
    7017           0 :         if (eErr != CE_None)
    7018           0 :             return eErr;
    7019             :     }
    7020             : 
    7021             :     /* -------------------------------------------------------------------- */
    7022             :     /*      Loop on overview bands.                                         */
    7023             :     /* -------------------------------------------------------------------- */
    7024           0 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    7025             :     {
    7026             :         GDALRasterBand *poOverview =
    7027           0 :             GDALRasterBand::FromHandle(pahOverviews[iOverview]);
    7028             :         double dfOverviewMean, dfOverviewStdDev;
    7029             : 
    7030             :         const CPLErr eErr =
    7031           0 :             GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
    7032             :                                  &dfOverviewStdDev, pfnProgress, pProgressData);
    7033             : 
    7034           0 :         if (eErr != CE_None)
    7035           0 :             return eErr;
    7036             : 
    7037           0 :         double dfGain = 1.0;
    7038           0 :         if (dfOrigStdDev >= 0.0001)
    7039           0 :             dfGain = dfOrigStdDev / dfOverviewStdDev;
    7040             : 
    7041             :         /* --------------------------------------------------------------------
    7042             :          */
    7043             :         /*      Apply gain and offset. */
    7044             :         /* --------------------------------------------------------------------
    7045             :          */
    7046           0 :         const int nWidth = poOverview->GetXSize();
    7047           0 :         const int nHeight = poOverview->GetYSize();
    7048             : 
    7049           0 :         GDALDataType eWrkType = GDT_Unknown;
    7050           0 :         float *pafData = nullptr;
    7051           0 :         const GDALDataType eType = poOverview->GetRasterDataType();
    7052           0 :         const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    7053           0 :         if (bComplex)
    7054             :         {
    7055             :             pafData = static_cast<float *>(
    7056           0 :                 VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
    7057           0 :             eWrkType = GDT_CFloat32;
    7058             :         }
    7059             :         else
    7060             :         {
    7061             :             pafData = static_cast<float *>(
    7062           0 :                 VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
    7063           0 :             eWrkType = GDT_Float32;
    7064             :         }
    7065             : 
    7066           0 :         if (pafData == nullptr)
    7067             :         {
    7068           0 :             return CE_Failure;
    7069             :         }
    7070             : 
    7071           0 :         for (int iLine = 0; iLine < nHeight; ++iLine)
    7072             :         {
    7073           0 :             if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    7074             :                              pProgressData))
    7075             :             {
    7076           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    7077           0 :                 CPLFree(pafData);
    7078           0 :                 return CE_Failure;
    7079             :             }
    7080             : 
    7081           0 :             if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
    7082             :                                      nWidth, 1, eWrkType, 0, 0,
    7083           0 :                                      nullptr) != CE_None)
    7084             :             {
    7085           0 :                 CPLFree(pafData);
    7086           0 :                 return CE_Failure;
    7087             :             }
    7088             : 
    7089           0 :             for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    7090             :             {
    7091           0 :                 if (bComplex)
    7092             :                 {
    7093           0 :                     pafData[static_cast<size_t>(iPixel) * 2] *=
    7094           0 :                         static_cast<float>(dfGain);
    7095           0 :                     pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
    7096           0 :                         static_cast<float>(dfGain);
    7097             :                 }
    7098             :                 else
    7099             :                 {
    7100           0 :                     pafData[iPixel] = static_cast<float>(
    7101           0 :                         (double(pafData[iPixel]) - dfOverviewMean) * dfGain +
    7102             :                         dfOrigMean);
    7103             :                 }
    7104             :             }
    7105             : 
    7106           0 :             if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
    7107             :                                      nWidth, 1, eWrkType, 0, 0,
    7108           0 :                                      nullptr) != CE_None)
    7109             :             {
    7110           0 :                 CPLFree(pafData);
    7111           0 :                 return CE_Failure;
    7112             :             }
    7113             :         }
    7114             : 
    7115           0 :         if (!pfnProgress(1.0, nullptr, pProgressData))
    7116             :         {
    7117           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    7118           0 :             CPLFree(pafData);
    7119           0 :             return CE_Failure;
    7120             :         }
    7121             : 
    7122           0 :         CPLFree(pafData);
    7123             :     }
    7124             : 
    7125           0 :     return CE_None;
    7126             : }

Generated by: LCOV version 1.14