LCOV - code coverage report
Current view: top level - gcore - overview.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2756 3136 87.9 %
Date: 2026-04-22 14:22:58 Functions: 175 192 91.1 %

          Line data    Source code
       1             : 
       2             : /******************************************************************************
       3             :  *
       4             :  * Project:  GDAL Core
       5             :  * Purpose:  Helper code to implement overview support in different drivers.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 2000, Frank Warmerdam
      10             :  * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal_priv.h"
      17             : 
      18             : #include <cmath>
      19             : #include <cstddef>
      20             : #include <cstdlib>
      21             : 
      22             : #include <algorithm>
      23             : #include <complex>
      24             : #include <condition_variable>
      25             : #include <limits>
      26             : #include <list>
      27             : #include <memory>
      28             : #include <mutex>
      29             : #include <vector>
      30             : 
      31             : #include "cpl_conv.h"
      32             : #include "cpl_error.h"
      33             : #include "cpl_float.h"
      34             : #include "cpl_progress.h"
      35             : #include "cpl_vsi.h"
      36             : #include "cpl_worker_thread_pool.h"
      37             : #include "gdal.h"
      38             : #include "gdal_thread_pool.h"
      39             : #include "gdalwarper.h"
      40             : #include "gdal_vrt.h"
      41             : #include "vrtdataset.h"
      42             : 
      43             : #ifdef USE_NEON_OPTIMIZATIONS
      44             : #include "include_sse2neon.h"
      45             : 
      46             : #if (!defined(__aarch64__) && !defined(_M_ARM64))
      47             : #define ARM_V7
      48             : #endif
      49             : 
      50             : #define USE_SSE2
      51             : 
      52             : #include "gdalsse_priv.h"
      53             : 
      54             : // Restrict to 64bit processors because they are guaranteed to have SSE2,
      55             : // or if __AVX2__ is defined.
      56             : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
      57             : #define USE_SSE2
      58             : 
      59             : #include "gdalsse_priv.h"
      60             : 
      61             : #ifdef __SSE3__
      62             : #include <pmmintrin.h>
      63             : #endif
      64             : #ifdef __SSSE3__
      65             : #include <tmmintrin.h>
      66             : #endif
      67             : #ifdef __SSE4_1__
      68             : #include <smmintrin.h>
      69             : #endif
      70             : #ifdef __AVX2__
      71             : #include <immintrin.h>
      72             : #endif
      73             : 
      74             : #endif
      75             : 
      76             : // To be included after above USE_SSE2 and include gdalsse_priv.h
      77             : // to avoid build issue on Windows x86
      78             : #include "gdal_priv_templates.hpp"
      79             : 
      80             : /************************************************************************/
      81             : /*                       GDALResampleChunk_Near()                       */
      82             : /************************************************************************/
      83             : 
      84             : template <class T>
      85        1251 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
      86             :                                       const T *pChunk, T **ppDstBuffer)
      87             : 
      88             : {
      89        1251 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
      90        1251 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
      91        1251 :     const GDALDataType eWrkDataType = args.eWrkDataType;
      92        1251 :     const int nChunkXOff = args.nChunkXOff;
      93        1251 :     const int nChunkXSize = args.nChunkXSize;
      94        1251 :     const int nChunkYOff = args.nChunkYOff;
      95        1251 :     const int nDstXOff = args.nDstXOff;
      96        1251 :     const int nDstXOff2 = args.nDstXOff2;
      97        1251 :     const int nDstYOff = args.nDstYOff;
      98        1251 :     const int nDstYOff2 = args.nDstYOff2;
      99        1251 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
     100             : 
     101             :     /* -------------------------------------------------------------------- */
     102             :     /*      Allocate buffers.                                               */
     103             :     /* -------------------------------------------------------------------- */
     104        1251 :     *ppDstBuffer = static_cast<T *>(
     105        1251 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
     106             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
     107        1251 :     if (*ppDstBuffer == nullptr)
     108             :     {
     109           0 :         return CE_Failure;
     110             :     }
     111        1251 :     T *const pDstBuffer = *ppDstBuffer;
     112             : 
     113             :     int *panSrcXOff =
     114        1251 :         static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
     115             : 
     116        1251 :     if (panSrcXOff == nullptr)
     117             :     {
     118           0 :         return CE_Failure;
     119             :     }
     120             : 
     121             :     /* ==================================================================== */
     122             :     /*      Precompute inner loop constants.                                */
     123             :     /* ==================================================================== */
     124      840888 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
     125             :     {
     126      839637 :         int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
     127      839637 :         if (nSrcXOff < nChunkXOff)
     128           0 :             nSrcXOff = nChunkXOff;
     129             : 
     130      839637 :         panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
     131             :     }
     132             : 
     133             :     /* ==================================================================== */
     134             :     /*      Loop over destination scanlines.                                */
     135             :     /* ==================================================================== */
     136      142463 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
     137             :     {
     138      141212 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
     139      141212 :         if (nSrcYOff < nChunkYOff)
     140           0 :             nSrcYOff = nChunkYOff;
     141             : 
     142      141212 :         const T *const pSrcScanline =
     143             :             pChunk +
     144      141212 :             (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
     145      137819 :             nChunkXOff;
     146             : 
     147             :         /* --------------------------------------------------------------------
     148             :          */
     149             :         /*      Loop over destination pixels */
     150             :         /* --------------------------------------------------------------------
     151             :          */
     152      141212 :         T *pDstScanline =
     153      141212 :             pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
     154   120252393 :         for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
     155             :         {
     156   120111000 :             pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
     157             :         }
     158             :     }
     159             : 
     160        1251 :     CPLFree(panSrcXOff);
     161             : 
     162        1251 :     return CE_None;
     163             : }
     164             : 
     165        1251 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
     166             :                                      const void *pChunk, void **ppDstBuffer,
     167             :                                      GDALDataType *peDstBufferDataType)
     168             : {
     169        1251 :     *peDstBufferDataType = args.eWrkDataType;
     170        1251 :     switch (args.eWrkDataType)
     171             :     {
     172             :         // For nearest resampling, as no computation is done, only the
     173             :         // size of the data type matters.
     174        1083 :         case GDT_UInt8:
     175             :         case GDT_Int8:
     176             :         {
     177        1083 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
     178        1083 :             return GDALResampleChunk_NearT(
     179             :                 args, static_cast<const uint8_t *>(pChunk),
     180        1083 :                 reinterpret_cast<uint8_t **>(ppDstBuffer));
     181             :         }
     182             : 
     183          52 :         case GDT_Int16:
     184             :         case GDT_UInt16:
     185             :         case GDT_Float16:
     186             :         {
     187          52 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
     188          52 :             return GDALResampleChunk_NearT(
     189             :                 args, static_cast<const uint16_t *>(pChunk),
     190          52 :                 reinterpret_cast<uint16_t **>(ppDstBuffer));
     191             :         }
     192             : 
     193          68 :         case GDT_CInt16:
     194             :         case GDT_CFloat16:
     195             :         case GDT_Int32:
     196             :         case GDT_UInt32:
     197             :         case GDT_Float32:
     198             :         {
     199          68 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
     200          68 :             return GDALResampleChunk_NearT(
     201             :                 args, static_cast<const uint32_t *>(pChunk),
     202          68 :                 reinterpret_cast<uint32_t **>(ppDstBuffer));
     203             :         }
     204             : 
     205          44 :         case GDT_CInt32:
     206             :         case GDT_CFloat32:
     207             :         case GDT_Int64:
     208             :         case GDT_UInt64:
     209             :         case GDT_Float64:
     210             :         {
     211          44 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
     212          44 :             return GDALResampleChunk_NearT(
     213             :                 args, static_cast<const uint64_t *>(pChunk),
     214          44 :                 reinterpret_cast<uint64_t **>(ppDstBuffer));
     215             :         }
     216             : 
     217           4 :         case GDT_CFloat64:
     218             :         {
     219           4 :             return GDALResampleChunk_NearT(
     220             :                 args, static_cast<const std::complex<double> *>(pChunk),
     221           4 :                 reinterpret_cast<std::complex<double> **>(ppDstBuffer));
     222             :         }
     223             : 
     224           0 :         case GDT_Unknown:
     225             :         case GDT_TypeCount:
     226           0 :             break;
     227             :     }
     228           0 :     CPLAssert(false);
     229             :     return CE_Failure;
     230             : }
     231             : 
     232             : namespace
     233             : {
     234             : 
     235             : // Find in the color table the entry whose RGB value is the closest
     236             : // (using quadratic distance) to the test color, ignoring transparent entries.
     237        3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
     238             :                    const GDALColorEntry &test)
     239             : {
     240        3837 :     int nMinDist = std::numeric_limits<int>::max();
     241        3837 :     size_t bestEntry = 0;
     242      986109 :     for (size_t i = 0; i < entries.size(); ++i)
     243             :     {
     244      982272 :         const GDALColorEntry &entry = entries[i];
     245             :         // Ignore transparent entries
     246      982272 :         if (entry.c4 == 0)
     247        3237 :             continue;
     248             : 
     249      979035 :         int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
     250      979035 :                     ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
     251      979035 :                     ((test.c3 - entry.c3) * (test.c3 - entry.c3));
     252      979035 :         if (nDist < nMinDist)
     253             :         {
     254       15847 :             nMinDist = nDist;
     255       15847 :             bestEntry = i;
     256             :         }
     257             :     }
     258        3837 :     return static_cast<int>(bestEntry);
     259             : }
     260             : 
     261           7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
     262             :                                            int &transparentIdx)
     263             : {
     264           7 :     std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
     265             : 
     266           7 :     transparentIdx = -1;
     267           7 :     int i = 0;
     268        1799 :     for (auto &entry : entries)
     269             :     {
     270        1792 :         table.GetColorEntryAsRGB(i, &entry);
     271        1792 :         if (transparentIdx < 0 && entry.c4 == 0)
     272           1 :             transparentIdx = i;
     273        1792 :         ++i;
     274             :     }
     275           7 :     return entries;
     276             : }
     277             : 
     278             : }  // unnamed  namespace
     279             : 
     280             : /************************************************************************/
     281             : /*                               SQUARE()                               */
     282             : /************************************************************************/
     283             : 
     284        6427 : template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
     285             : {
     286        6427 :     return static_cast<Tsquare>(val) * val;
     287             : }
     288             : 
     289             : /************************************************************************/
     290             : /*                         ComputeIntegerRMS()                          */
     291             : /************************************************************************/
     292             : // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
     293             : // integer that minimizes abs(rms**2 - sumSquares / weight)
     294             : template <class T, class Twork>
     295          42 : inline T ComputeIntegerRMS(double sumSquares, double weight)
     296             : {
     297          42 :     const double sumDivWeight = sumSquares / weight;
     298          42 :     T rms = static_cast<T>(sqrt(sumDivWeight));
     299             : 
     300             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     301             :     // Naive version:
     302             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     303          42 :     if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
     304          42 :         2 * sumDivWeight)
     305           6 :         rms += 1;
     306          42 :     return rms;
     307             : }
     308             : 
     309             : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
     310             : {
     311             :     CPLAssert(false);
     312             :     return 0;
     313             : }
     314             : 
     315          28 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
     316             : {
     317             :     // It has been verified that given the correction on rms below, using
     318             :     // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
     319             :     // is equivalent, so use the former as it is used twice.
     320          28 :     const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
     321          28 :     const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
     322          28 :     GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
     323             : 
     324             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     325             :     // Naive version:
     326             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     327             :     // Optimized version for integer case and weight == 4
     328          28 :     if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
     329           5 :         rms += 1;
     330          28 :     return rms;
     331             : }
     332             : 
     333             : template <>
     334          24 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
     335             : {
     336          24 :     const double sumDivWeight = sumSquares * 0.25;
     337          24 :     GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
     338             : 
     339             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     340             :     // Naive version:
     341             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     342             :     // Optimized version for integer case and weight == 4
     343          24 :     if (static_cast<GUInt32>(rms) * (rms + 1) <
     344          24 :         static_cast<GUInt32>(sumDivWeight + 0.25))
     345           4 :         rms += 1;
     346          24 :     return rms;
     347             : }
     348             : 
     349             : #ifdef USE_SSE2
     350             : 
     351             : /************************************************************************/
     352             : /*                    QuadraticMeanByteSSE2OrAVX2()                     */
     353             : /************************************************************************/
     354             : 
     355             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     356             : #define sse2_packus_epi32 _mm_packus_epi32
     357             : #else
     358      516139 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
     359             : {
     360      516139 :     const auto minus32768_32 = _mm_set1_epi32(-32768);
     361      516139 :     const auto minus32768_16 = _mm_set1_epi16(-32768);
     362      516139 :     a = _mm_add_epi32(a, minus32768_32);
     363      516139 :     b = _mm_add_epi32(b, minus32768_32);
     364      516139 :     a = _mm_packs_epi32(a, b);
     365      516139 :     a = _mm_sub_epi16(a, minus32768_16);
     366      516139 :     return a;
     367             : }
     368             : #endif
     369             : 
     370             : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
     371             : #define sse2_hadd_epi16 _mm_hadd_epi16
     372             : #else
     373     5064270 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
     374             : {
     375             :     // Horizontal addition of adjacent pairs
     376     5064270 :     const auto mask = _mm_set1_epi32(0xFFFF);
     377             :     const auto horizLo =
     378    15192800 :         _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
     379             :     const auto horizHi =
     380    15192800 :         _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
     381             : 
     382             :     // Recombine low and high parts
     383     5064270 :     return _mm_packs_epi32(horizLo, horizHi);
     384             : }
     385             : #endif
     386             : 
     387             : #ifdef __AVX2__
     388             : 
     389             : #define set1_epi16 _mm256_set1_epi16
     390             : #define set1_epi32 _mm256_set1_epi32
     391             : #define setzero _mm256_setzero_si256
     392             : #define set1_ps _mm256_set1_ps
     393             : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
     394             : #define unpacklo_epi8 _mm256_unpacklo_epi8
     395             : #define unpackhi_epi8 _mm256_unpackhi_epi8
     396             : #define madd_epi16 _mm256_madd_epi16
     397             : #define add_epi32 _mm256_add_epi32
     398             : #define mul_ps _mm256_mul_ps
     399             : #define cvtepi32_ps _mm256_cvtepi32_ps
     400             : #define sqrt_ps _mm256_sqrt_ps
     401             : #define cvttps_epi32 _mm256_cvttps_epi32
     402             : #define packs_epi32 _mm256_packs_epi32
     403             : #define packus_epi32 _mm256_packus_epi32
     404             : #define srli_epi32 _mm256_srli_epi32
     405             : #define mullo_epi16 _mm256_mullo_epi16
     406             : #define srli_epi16 _mm256_srli_epi16
     407             : #define cmpgt_epi16 _mm256_cmpgt_epi16
     408             : #define add_epi16 _mm256_add_epi16
     409             : #define sub_epi16 _mm256_sub_epi16
     410             : #define packus_epi16 _mm256_packus_epi16
     411             : 
     412             : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
     413             : /* to get the lower 128-bit bits of what would be a true 256-bit vector register
     414             :  */
     415             : 
     416             : inline __m256i FIXUP_LANES(__m256i x)
     417             : {
     418             :     return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
     419             : }
     420             : 
     421             : #define store_lo(x, y)                                                         \
     422             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
     423             :                      _mm256_extracti128_si256(FIXUP_LANES(y), 0))
     424             : #define storeu_int(x, y)                                                       \
     425             :     _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))
     426             : #define hadd_epi16 _mm256_hadd_epi16
     427             : #else
     428             : #define set1_epi16 _mm_set1_epi16
     429             : #define set1_epi32 _mm_set1_epi32
     430             : #define setzero _mm_setzero_si128
     431             : #define set1_ps _mm_set1_ps
     432             : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
     433             : #define unpacklo_epi8 _mm_unpacklo_epi8
     434             : #define unpackhi_epi8 _mm_unpackhi_epi8
     435             : #define madd_epi16 _mm_madd_epi16
     436             : #define add_epi32 _mm_add_epi32
     437             : #define mul_ps _mm_mul_ps
     438             : #define cvtepi32_ps _mm_cvtepi32_ps
     439             : #define sqrt_ps _mm_sqrt_ps
     440             : #define cvttps_epi32 _mm_cvttps_epi32
     441             : #define packs_epi32 _mm_packs_epi32
     442             : #define packus_epi32 sse2_packus_epi32
     443             : #define srli_epi32 _mm_srli_epi32
     444             : #define mullo_epi16 _mm_mullo_epi16
     445             : #define srli_epi16 _mm_srli_epi16
     446             : #define cmpgt_epi16 _mm_cmpgt_epi16
     447             : #define add_epi16 _mm_add_epi16
     448             : #define sub_epi16 _mm_sub_epi16
     449             : #define packus_epi16 _mm_packus_epi16
     450             : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
     451             : #define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))
     452             : #define hadd_epi16 sse2_hadd_epi16
     453             : #endif
     454             : 
     455             : template <class T>
     456             : static int
     457             : #if defined(__GNUC__)
     458             :     __attribute__((noinline))
     459             : #endif
     460        5389 :     QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
     461             :                                 const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     462             :                                 T *CPL_RESTRICT pDstScanline)
     463             : {
     464             :     // Optimized implementation for RMS on Byte by
     465             :     // processing by group of 8 output pixels, so as to use
     466             :     // a single _mm_sqrt_ps() call for 4 output pixels
     467        5389 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     468             : 
     469        5389 :     int iDstPixel = 0;
     470        5389 :     const auto one16 = set1_epi16(1);
     471        5389 :     const auto one32 = set1_epi32(1);
     472        5389 :     const auto zero = setzero();
     473        5389 :     const auto minus32768 = set1_epi16(-32768);
     474             : 
     475        5389 :     constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
     476      521504 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     477             :     {
     478             :         // Load 2 * DEST_ELTS bytes from each line
     479      516115 :         auto firstLine = loadu_int(pSrcScanlineShifted);
     480     1032230 :         auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
     481             :         // Extend those Bytes as UInt16s
     482      516115 :         auto firstLineLo = unpacklo_epi8(firstLine, zero);
     483      516115 :         auto firstLineHi = unpackhi_epi8(firstLine, zero);
     484      516115 :         auto secondLineLo = unpacklo_epi8(secondLine, zero);
     485      516115 :         auto secondLineHi = unpackhi_epi8(secondLine, zero);
     486             : 
     487             :         // Multiplication of 16 bit values and horizontal
     488             :         // addition of 32 bit results
     489             :         // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
     490      516115 :         firstLineLo = madd_epi16(firstLineLo, firstLineLo);
     491      516115 :         firstLineHi = madd_epi16(firstLineHi, firstLineHi);
     492      516115 :         secondLineLo = madd_epi16(secondLineLo, secondLineLo);
     493      516115 :         secondLineHi = madd_epi16(secondLineHi, secondLineHi);
     494             : 
     495             :         // Vertical addition
     496      516115 :         const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
     497      516115 :         const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
     498             : 
     499             :         const auto sumSquaresPlusOneDiv4Lo =
     500     1032230 :             srli_epi32(add_epi32(sumSquaresLo, one32), 2);
     501             :         const auto sumSquaresPlusOneDiv4Hi =
     502     1032230 :             srli_epi32(add_epi32(sumSquaresHi, one32), 2);
     503             : 
     504             :         // Take square root and truncate/floor to int32
     505             :         const auto rmsLo =
     506     1548340 :             cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
     507             :         const auto rmsHi =
     508     1548340 :             cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
     509             : 
     510             :         // Merge back low and high registers with each RMS value
     511             :         // as a 16 bit value.
     512      516115 :         auto rms = packs_epi32(rmsLo, rmsHi);
     513             : 
     514             :         // Round to upper value if it minimizes the
     515             :         // error |rms^2 - sumSquares/4|
     516             :         // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     517             :         //    rms += 1;
     518             :         // which is equivalent to:
     519             :         // if( rms * (rms + 1) < (sumSquares+1) / 4 )
     520             :         //    rms += 1;
     521             :         // And both left and right parts fit on 16 (unsigned) bits
     522             :         const auto sumSquaresPlusOneDiv4 =
     523      516115 :             packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
     524             :         // cmpgt_epi16 operates on signed int16, but here
     525             :         // we have unsigned values, so shift them by -32768 before
     526     2580580 :         const auto mask = cmpgt_epi16(
     527             :             add_epi16(sumSquaresPlusOneDiv4, minus32768),
     528             :             add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
     529             :         // The value of the mask will be -1 when the correction needs to be
     530             :         // applied
     531      516115 :         rms = sub_epi16(rms, mask);
     532             : 
     533             :         // Pack each 16 bit RMS value to 8 bits
     534      516115 :         rms = packus_epi16(rms, rms /* could be anything */);
     535      516115 :         store_lo(&pDstScanline[iDstPixel], rms);
     536      516115 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     537             :     }
     538             : 
     539        5389 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     540        5389 :     return iDstPixel;
     541             : }
     542             : 
     543             : /************************************************************************/
     544             : /*                       AverageByteSSE2OrAVX2()                        */
     545             : /************************************************************************/
     546             : 
     547             : static int
     548      123976 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
     549             :                       const GByte *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     550             :                       GByte *CPL_RESTRICT pDstScanline)
     551             : {
     552             :     // Optimized implementation for average on Byte by
     553             :     // processing by group of 16 output pixels for SSE2, or 32 for AVX2
     554             :     // (2 * DEST_ELTS per iteration). Each output is (sum of a 2x2 source block + 2) / 4; the second source line is read at pSrcScanlineShifted + nChunkXSize.
     555      123976 :     const auto zero = setzero();
     556      123976 :     const auto two16 = set1_epi16(2);
     557      123976 :     const GByte *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     558             : 
     559      123976 :     constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;  // vector size in bytes / 2
     560      123976 :     int iDstPixel = 0;
     561     2656110 :     for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);  // only while a full group of 2 * DEST_ELTS outputs remains
     562     2532130 :          iDstPixel += 2 * DEST_ELTS)
     563             :     {
     564             :         decltype(setzero()) average0;  // 16-bit averages computed from the first 2 * DEST_ELTS source columns
     565             :         {
     566             :             // Load 2 * DEST_ELTS bytes from each line
     567     2532130 :             const auto firstLine = loadu_int(pSrcScanlineShifted);
     568             :             const auto secondLine =
     569     5064270 :                 loadu_int(pSrcScanlineShifted + nChunkXSize);
     570             :             // Extend those Bytes as UInt16s
     571     2532130 :             const auto firstLineLo = unpacklo_epi8(firstLine, zero);
     572     2532130 :             const auto firstLineHi = unpackhi_epi8(firstLine, zero);
     573     2532130 :             const auto secondLineLo = unpacklo_epi8(secondLine, zero);
     574     2532130 :             const auto secondLineHi = unpackhi_epi8(secondLine, zero);
     575             : 
     576             :             // Vertical addition
     577     2532130 :             const auto sumLo = add_epi16(firstLineLo, secondLineLo);
     578     2532130 :             const auto sumHi = add_epi16(firstLineHi, secondLineHi);
     579             : 
     580             :             // Horizontal addition of adjacent pairs, and recombine low and high
     581             :             // parts
     582     2532130 :             const auto sum = hadd_epi16(sumLo, sumHi);
     583             : 
     584             :             // average = (sum + 2) / 4
     585     2532130 :             average0 = srli_epi16(add_epi16(sum, two16), 2);
     586             : 
     587     2532130 :             pSrcScanlineShifted += 2 * DEST_ELTS;
     588             :         }
     589             : 
     590             :         decltype(setzero()) average1;  // same computation on the next 2 * DEST_ELTS source columns
     591             :         {
     592             :             // Load 2 * DEST_ELTS bytes from each line
     593     2532130 :             const auto firstLine = loadu_int(pSrcScanlineShifted);
     594             :             const auto secondLine =
     595     5064270 :                 loadu_int(pSrcScanlineShifted + nChunkXSize);
     596             :             // Extend those Bytes as UInt16s
     597     2532130 :             const auto firstLineLo = unpacklo_epi8(firstLine, zero);
     598     2532130 :             const auto firstLineHi = unpackhi_epi8(firstLine, zero);
     599     2532130 :             const auto secondLineLo = unpacklo_epi8(secondLine, zero);
     600     2532130 :             const auto secondLineHi = unpackhi_epi8(secondLine, zero);
     601             : 
     602             :             // Vertical addition
     603     2532130 :             const auto sumLo = add_epi16(firstLineLo, secondLineLo);
     604     2532130 :             const auto sumHi = add_epi16(firstLineHi, secondLineHi);
     605             : 
     606             :             // Horizontal addition of adjacent pairs, and recombine low and high
     607             :             // parts
     608     2532130 :             const auto sum = hadd_epi16(sumLo, sumHi);
     609             : 
     610             :             // average = (sum + 2) / 4
     611     2532130 :             average1 = srli_epi16(add_epi16(sum, two16), 2);
     612             : 
     613     2532130 :             pSrcScanlineShifted += 2 * DEST_ELTS;
     614             :         }
     615             : 
     616             :         // Pack each 16 bit average value to 8 bits
     617     2532130 :         const auto average = packus_epi16(average0, average1);
     618     2532130 :         storeu_int(&pDstScanline[iDstPixel], average);
     619             :     }
     620             : 
     621      123976 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;  // report how far the source line was consumed
     622      123976 :     return iDstPixel;  // number of output pixels written; any remaining ones are not processed here
     623             : }
     624             : 
     625             : /************************************************************************/
     626             : /*                      QuadraticMeanUInt16SSE2()                       */
     627             : /************************************************************************/
     628             : 
     629             : #ifdef __SSE3__  // use the native horizontal add when SSE3 is enabled
     630             : #define sse2_hadd_pd _mm_hadd_pd
     631             : #else  // otherwise emulate it with shuffles and a vertical add
     632         185 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
     633             : {
     634             :     auto aLo_bLo =  // (a[0], b[0])
     635         740 :         _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
     636             :     auto aHi_bHi =  // (a[1], b[1])
     637         740 :         _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
     638         185 :     return _mm_add_pd(aLo_bLo, aHi_bHi);  // (aLo + aHi, bLo + bHi)
     639             : }
     640             : #endif
     641             : 
     642         120 : inline __m128d SQUARE_PD(__m128d x)  // element-wise x * x on a vector of 2 doubles
     643             : {
     644         120 :     return _mm_mul_pd(x, x);
     645             : }
     646             : 
     647             : #ifdef __AVX2__  // 256-bit overloads, only compiled when AVX2 is enabled
     648             : // Element-wise x * x on a vector of 4 doubles.
     649             : inline __m256d SQUARE_PD(__m256d x)
     650             : {
     651             :     return _mm256_mul_pd(x, x);
     652             : }
     653             : // Swaps the two middle 64-bit elements: (x0, x1, x2, x3) -> (x0, x2, x1, x3). Applied below to the result of _mm256_hadd_pd.
     654             : inline __m256d FIXUP_LANES(__m256d x)
     655             : {
     656             :     return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
     657             : }
     658             : // Same reordering of 64-bit elements, for a vector declared as 8 floats.
     659             : inline __m256 FIXUP_LANES(__m256 x)
     660             : {
     661             :     return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
     662             : }
     663             : 
     664             : #endif
     665             : 
     666             : static int
     667          14 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
     668             :                         const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     669             :                         uint16_t *CPL_RESTRICT pDstScanline)
     670             : {
     671             :     // Optimized implementation for RMS on UInt16 by
     672             :     // processing by group of 4 output pixels.
     673          14 :     const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     674             :     // Each output is the rounded square root of (sum of the squares of a 2x2 source block) / 4; the second source line is read at offset nChunkXSize.
     675          14 :     int iDstPixel = 0;
     676          14 :     const auto zero = _mm_setzero_si128();
     677             : 
     678             : #ifdef __AVX2__
     679             :     const auto zeroDot25 = _mm256_set1_pd(0.25);
     680             :     const auto zeroDot5 = _mm256_set1_pd(0.5);
     681             : 
     682             :     // The first four 0's could be anything, as we only take the bottom
     683             :     // 128 bits.
     684             :     const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
     685             : #else
     686          14 :     const auto zeroDot25 = _mm_set1_pd(0.25);
     687          14 :     const auto zeroDot5 = _mm_set1_pd(0.5);
     688             : #endif
     689             : 
     690          14 :     constexpr int DEST_ELTS =  // 8 UInt16 per 128-bit load, 2 source columns per output => 4
     691             :         static_cast<int>(sizeof(zero) / sizeof(uint16_t)) / 2;
     692          52 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     693             :     {
     694             :         // Load 8 UInt16 from each line
     695          38 :         const auto firstLine = _mm_loadu_si128(
     696             :             reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     697             :         const auto secondLine =
     698          38 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     699          38 :                 pSrcScanlineShifted + nChunkXSize));
     700             : 
     701             :         // Detect if all of the source values fit in 14 bits,
     702             :         // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
     703             :         // and we can do a much faster implementation.
     704             :         const auto maskTmp =
     705          76 :             _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
     706             : #if defined(__i386__) || defined(_M_IX86)  // 32-bit x86: extract the low 64 bits through memory instead of _mm_cvtsi128_si64
     707             :         uint64_t nMaskFitsIn14Bits = 0;
     708             :         _mm_storel_epi64(
     709             :             reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
     710             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     711             : #else
     712          38 :         const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
     713             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     714             : #endif
     715          38 :         if (nMaskFitsIn14Bits == 0)  // all 16 source values are < 2^14: all-integer/float32 fast path
     716             :         {
     717             :             // Multiplication of 16 bit values and horizontal
     718             :             // addition of 32 bit results
     719             :             const auto firstLineHSumSquare =
     720          26 :                 _mm_madd_epi16(firstLine, firstLine);
     721             :             const auto secondLineHSumSquare =
     722          26 :                 _mm_madd_epi16(secondLine, secondLine);
     723             :             // Vertical addition
     724             :             const auto sumSquares =
     725          26 :                 _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
     726             :             // In theory we should take sqrt(sumSquares * 0.25f)
     727             :             // but given the rounding we do, this is equivalent to
     728             :             // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
     729             :             // sumSquares <= 4 * 16383^2
     730          26 :             const auto one32 = _mm_set1_epi32(1);
     731             :             const auto sumSquaresPlusOneDiv4 =
     732          52 :                 _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
     733             :             // Take square root and truncate/floor to int32
     734          78 :             auto rms = _mm_cvttps_epi32(
     735             :                 _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
     736             : 
     737             :             // Round to upper value if it minimizes the
     738             :             // error |rms^2 - sumSquares/4|
     739             :             // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     740             :             //    rms += 1;
     741             :             // which is equivalent to:
     742             :             // if( rms * rms + rms < (sumSquares+1) / 4 )
     743             :             //    rms += 1;
     744             :             auto mask =  // -1 in the lanes where rms must be incremented
     745          78 :                 _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
     746             :                                 _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
     747          26 :             rms = _mm_sub_epi32(rms, mask);
     748             :             // Pack each 32 bit RMS value to 16 bits
     749          26 :             rms = _mm_packs_epi32(rms, rms /* could be anything */);
     750             :             _mm_storel_epi64(
     751          26 :                 reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
     752          26 :             pSrcScanlineShifted += 2 * DEST_ELTS;
     753          26 :             continue;
     754             :         }
     755             : 
     756             :         // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
     757             :         // to 32 bit would result in 4 multiplications instead of 8, but
     758             :         // mullo/mulhi have a worse throughput than mul_pd.
     759             : 
     760             :         // Extend those UInt16s as UInt32s
     761          12 :         const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
     762          12 :         const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
     763          12 :         const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
     764          12 :         const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
     765             : 
     766             : #ifdef __AVX2__
     767             :         // Multiplication of 32 bit values previously converted to 64 bit double
     768             :         const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
     769             :         const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
     770             :         const auto secondLineLoDbl =
     771             :             SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
     772             :         const auto secondLineHiDbl =
     773             :             SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
     774             : 
     775             :         // Vertical addition of squares
     776             :         const auto sumSquaresLo =
     777             :             _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
     778             :         const auto sumSquaresHi =
     779             :             _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
     780             : 
     781             :         // Horizontal addition of squares
     782             :         const auto sumSquares =
     783             :             FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
     784             : 
     785             :         const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
     786             : 
     787             :         // Take square root and truncate/floor to int32
     788             :         auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
     789             :         const auto rmsDouble = _mm256_cvtepi32_pd(rms);
     790             :         const auto right = _mm256_sub_pd(  // sumDivWeight - (rms * rms + rms); rms is incremented where this exceeds 0.5
     791             :             sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
     792             : 
     793             :         auto mask =
     794             :             _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
     795             :         // Extract 32-bit from each of the 4 64-bit masks
     796             :         // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
     797             :         // _MM_SHUFFLE(2,0,2,0)));
     798             :         mask = _mm256_permutevar8x32_ps(mask, permutation);
     799             :         const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
     800             : 
     801             :         // Apply the correction
     802             :         rms = _mm_sub_epi32(rms, maskI);
     803             : 
     804             :         // Pack each 32 bit RMS value to 16 bits
     805             :         rms = _mm_packus_epi32(rms, rms /* could be anything */);
     806             : #else
     807             :         // Multiplication of 32 bit values previously converted to 64 bit double
     808          12 :         const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
     809             :         const auto firstLineLoHi =
     810          24 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
     811          12 :         const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
     812             :         const auto firstLineHiHi =
     813          24 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
     814             : 
     815          12 :         const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
     816             :         const auto secondLineLoHi =
     817          24 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
     818          12 :         const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
     819             :         const auto secondLineHiHi =
     820          24 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
     821             : 
     822             :         // Vertical addition of squares
     823          12 :         const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
     824          12 :         const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
     825          12 :         const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
     826          12 :         const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
     827             : 
     828             :         // Horizontal addition of squares
     829          12 :         const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
     830          12 :         const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
     831             : 
     832          12 :         const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
     833          12 :         const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
     834             :         // Take square root and truncate/floor to int32
     835          24 :         const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
     836          24 :         const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
     837             : 
     838             :         // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
     839             :         // if( 0.5 < sumDivWeight - (rms * rms + rms) )
     840             :         //     rms += 1;
     841          12 :         const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
     842          12 :         const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
     843          24 :         const auto rightLo = _mm_sub_pd(
     844             :             sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
     845          36 :         const auto rightHi = _mm_sub_pd(
     846             :             sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
     847             : 
     848          24 :         const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
     849          12 :         const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
     850             :         // The value of the mask will be -1 when the correction needs to be
     851             :         // applied
     852          24 :         const auto mask = _mm_castps_si128(_mm_shuffle_ps(  // keep 32-bit elements 0 and 2 of each 64-bit compare result pair
     853             :             maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
     854             : 
     855          48 :         auto rms = _mm_castps_si128(  // (rmsLo[0], rmsLo[1], rmsHi[0], rmsHi[1]) as 4 int32
     856             :             _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
     857             :         // Apply the correction
     858          12 :         rms = _mm_sub_epi32(rms, mask);
     859             : 
     860             :         // Pack each 32 bit RMS value to 16 bits
     861          12 :         rms = sse2_packus_epi32(rms, rms /* could be anything */);
     862             : #endif
     863             : 
     864          12 :         _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
     865             :                          rms);
     866          12 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     867             :     }
     868             : 
     869          14 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;  // report how far the source line was consumed
     870          14 :     return iDstPixel;  // number of output pixels written; any remaining ones are not processed here
     871             : }
     872             : 
     873             : /************************************************************************/
     874             : /*                         AverageUInt16SSE2()                          */
     875             : /************************************************************************/
     876             : 
     877             : static int
     878          13 : AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
     879             :                   const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     880             :                   uint16_t *CPL_RESTRICT pDstScanline)
     881             : {
     882             :     // Optimized implementation for average on UInt16 by
     883             :     // processing by group of 8 output pixels.
     884             :     // Each output is (sum of a 2x2 source block + 2) >> 2; the second source line is read at offset nChunkXSize.
     885          13 :     const auto mask = _mm_set1_epi32(0xFFFF);  // keeps the low 16 bits of each 32-bit lane
     886          13 :     const auto two = _mm_set1_epi32(2);
     887          13 :     const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     888             : 
     889          13 :     int iDstPixel = 0;
     890          13 :     constexpr int DEST_ELTS = static_cast<int>(sizeof(mask) / sizeof(uint16_t));
     891          25 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     892             :     {
     893             :         __m128i averageLow;  // 4 x 32-bit averages from the first 8 source columns
     894             :         // Load 8 UInt16 from each line
     895             :         {
     896          12 :             const auto firstLine = _mm_loadu_si128(
     897             :                 reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     898             :             const auto secondLine =
     899          12 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     900          12 :                     pSrcScanlineShifted + nChunkXSize));
     901             : 
     902             :             // Horizontal addition and extension to 32 bit
     903          36 :             const auto horizAddFirstLine = _mm_add_epi32(  // low 16 bits of each lane + high 16 bits of the same lane
     904             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     905             :             const auto horizAddSecondLine =
     906          36 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     907             :                               _mm_srli_epi32(secondLine, 16));
     908             : 
     909             :             // Vertical addition and average computation
     910             :             // average = (sum + 2) >> 2
     911          24 :             const auto sum = _mm_add_epi32(
     912             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     913          12 :             averageLow = _mm_srli_epi32(sum, 2);
     914             :         }
     915             :         // Load 8 UInt16 from each line
     916             :         __m128i averageHigh;  // 4 x 32-bit averages from the next 8 source columns
     917             :         {
     918             :             const auto firstLine =
     919          12 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     920          12 :                     pSrcScanlineShifted + DEST_ELTS));
     921             :             const auto secondLine =
     922          12 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     923          12 :                     pSrcScanlineShifted + DEST_ELTS + nChunkXSize));
     924             : 
     925             :             // Horizontal addition and extension to 32 bit
     926          36 :             const auto horizAddFirstLine = _mm_add_epi32(
     927             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     928             :             const auto horizAddSecondLine =
     929          36 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     930             :                               _mm_srli_epi32(secondLine, 16));
     931             : 
     932             :             // Vertical addition and average computation
     933             :             // average = (sum + 2) >> 2
     934          24 :             const auto sum = _mm_add_epi32(
     935             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     936          12 :             averageHigh = _mm_srli_epi32(sum, 2);
     937             :         }
     938             : 
     939             :         // Pack each 32 bit average value to 16 bits
     940          12 :         auto average = sse2_packus_epi32(averageLow, averageHigh);
     941          12 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
     942             :                          average);
     943          12 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     944             :     }
     945             : 
     946          13 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;  // report how far the source line was consumed
     947          13 :     return iDstPixel;  // number of output pixels written; any remaining ones are not processed here
     948             : }
     949             : 
     950             : /************************************************************************/
     951             : /*                       QuadraticMeanFloatSSE2()                       */
     952             : /************************************************************************/
     953             : 
     954             : #if !defined(ARM_V7)
     955             : 
     956             : #ifdef __SSE3__  // use the native horizontal add when SSE3 is enabled
     957             : #define sse2_hadd_ps _mm_hadd_ps
     958             : #else  // otherwise emulate it with two shuffles and a vertical add
     959          82 : inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
     960             : {
     961          82 :     auto aEven_bEven = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));  // (a[0], a[2], b[0], b[2])
     962          82 :     auto aOdd_bOdd = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));  // (a[1], a[3], b[1], b[3])
     963          82 :     return _mm_add_ps(aEven_bEven, aOdd_bOdd);  // (aEven + aOdd, bEven + bOdd)
     964             : }
     965             : #endif
     966             : 
     967             : #ifdef __AVX2__  // map the generic *_ps names to the 256-bit (8 x float) intrinsics
     968             : #define set1_ps _mm256_set1_ps
     969             : #define loadu_ps _mm256_loadu_ps
     970             : #define andnot_ps _mm256_andnot_ps
     971             : #define and_ps _mm256_and_ps
     972             : #define max_ps _mm256_max_ps
     973             : #define shuffle_ps _mm256_shuffle_ps
     974             : #define div_ps _mm256_div_ps
     975             : #define cmpeq_ps(x, y) _mm256_cmp_ps((x), (y), _CMP_EQ_OQ)
     976             : #define mul_ps _mm256_mul_ps
     977             : #define add_ps _mm256_add_ps
     978             : #define hadd_ps _mm256_hadd_ps
     979             : #define sqrt_ps _mm256_sqrt_ps
     980             : #define or_ps _mm256_or_ps
     981             : #define unpacklo_ps _mm256_unpacklo_ps
     982             : #define unpackhi_ps _mm256_unpackhi_ps
     983             : #define storeu_ps _mm256_storeu_ps
     984             : #define blendv_ps _mm256_blendv_ps
     985             : // Element-wise x * x on a vector of 8 floats.
     986             : inline __m256 SQUARE_PS(__m256 x)
     987             : {
     988             :     return _mm256_mul_ps(x, x);
     989             : }
     990             : 
     991             : #else
     992             : // Without AVX2: map the same generic *_ps names to the 128-bit (4 x float) intrinsics.
     993             : #define set1_ps _mm_set1_ps
     994             : #define loadu_ps _mm_loadu_ps
     995             : #define andnot_ps _mm_andnot_ps
     996             : #define and_ps _mm_and_ps
     997             : #define max_ps _mm_max_ps
     998             : #define shuffle_ps _mm_shuffle_ps
     999             : #define div_ps _mm_div_ps
    1000             : #define cmpeq_ps _mm_cmpeq_ps
    1001             : #define mul_ps _mm_mul_ps
    1002             : #define add_ps _mm_add_ps
    1003             : #define hadd_ps sse2_hadd_ps
    1004             : #define sqrt_ps _mm_sqrt_ps
    1005             : #define or_ps _mm_or_ps
    1006             : #define unpacklo_ps _mm_unpacklo_ps
    1007             : #define unpackhi_ps _mm_unpackhi_ps
    1008             : #define storeu_ps _mm_storeu_ps
    1009             : // Selects b where mask is set and a elsewhere. The bitwise fallback matches _mm_blendv_ps only when each mask lane is all-ones or all-zeros.
    1010         132 : inline __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
    1011             : {
    1012             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    1013             :     return _mm_blendv_ps(a, b, mask);
    1014             : #else
    1015         396 :     return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));  // (~mask & a) | (mask & b)
    1016             : #endif
    1017             : }
    1018             : // Element-wise x * x on a vector of 4 floats.
    1019         528 : inline __m128 SQUARE_PS(__m128 x)
    1020             : {
    1021         528 :     return _mm_mul_ps(x, x);
    1022             : }
    1023             : // Identity for 128-bit vectors; the AVX2 overloads of FIXUP_LANES reorder 64-bit elements instead.
    1024         132 : inline __m128 FIXUP_LANES(__m128 x)
    1025             : {
    1026         132 :     return x;
    1027             : }
    1028             : 
    1029             : #endif
    1030             : 
    1031             : static int
    1032             : #if defined(__GNUC__)
    1033             :     __attribute__((noinline))
    1034             : #endif
    1035          66 :     QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
    1036             :                            const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1037             :                            float *CPL_RESTRICT pDstScanline)
    1038             : {
    1039             :     // Optimized implementation for RMS on Float32 by
    1040             :     // processing by group of output pixels.
    1041          66 :     const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
    1042             : 
    1043          66 :     int iDstPixel = 0;
    1044          66 :     const auto minus_zero = set1_ps(-0.0f);
    1045          66 :     const auto zeroDot25 = set1_ps(0.25f);
    1046          66 :     const auto one = set1_ps(1.0f);
    1047          66 :     const auto infv = set1_ps(std::numeric_limits<float>::infinity());
    1048          66 :     constexpr int DEST_ELTS = static_cast<int>(sizeof(one) / sizeof(float));
    1049             : 
    1050         198 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
    1051             :     {
    1052             :         // Load 2*DEST_ELTS Float32 from each line
    1053         132 :         auto firstLineLo = loadu_ps(pSrcScanlineShifted);
    1054         132 :         auto firstLineHi = loadu_ps(pSrcScanlineShifted + DEST_ELTS);
    1055         132 :         auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
    1056             :         auto secondLineHi =
    1057         264 :             loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize);
    1058             : 
    1059             :         // Take the absolute value
    1060         132 :         firstLineLo = andnot_ps(minus_zero, firstLineLo);
    1061         132 :         firstLineHi = andnot_ps(minus_zero, firstLineHi);
    1062         132 :         secondLineLo = andnot_ps(minus_zero, secondLineLo);
    1063         132 :         secondLineHi = andnot_ps(minus_zero, secondLineHi);
    1064             : 
    1065             :         auto firstLineEven =
    1066         132 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1067             :         auto firstLineOdd =
    1068         132 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1069             :         auto secondLineEven =
    1070         132 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1071             :         auto secondLineOdd =
    1072         132 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1073             : 
    1074             :         // Compute the maximum of each DEST_ELTS value to RMS-average
    1075         396 :         const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
    1076             :                                  max_ps(secondLineEven, secondLineOdd));
    1077             : 
    1078             :         // Normalize each value by the maximum of the DEST_ELTS ones.
    1079             :         // This step is important to avoid that the square evaluates to infinity
    1080             :         // for sufficiently big input.
    1081         132 :         auto invMax = div_ps(one, maxV);
    1082             :         // Deal with 0 being the maximum to correct division by zero
    1083             :         // note: comparing to -0 leads to identical results as to comparing with
    1084             :         // 0
    1085         264 :         invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
    1086             : 
    1087         132 :         firstLineEven = mul_ps(firstLineEven, invMax);
    1088         132 :         firstLineOdd = mul_ps(firstLineOdd, invMax);
    1089         132 :         secondLineEven = mul_ps(secondLineEven, invMax);
    1090         132 :         secondLineOdd = mul_ps(secondLineOdd, invMax);
    1091             : 
    1092             :         // Compute squares
    1093         132 :         firstLineEven = SQUARE_PS(firstLineEven);
    1094         132 :         firstLineOdd = SQUARE_PS(firstLineOdd);
    1095         132 :         secondLineEven = SQUARE_PS(secondLineEven);
    1096         132 :         secondLineOdd = SQUARE_PS(secondLineOdd);
    1097             : 
    1098         396 :         const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
    1099             :                                        add_ps(secondLineEven, secondLineOdd));
    1100             : 
    1101         396 :         auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
    1102             : 
    1103             :         // Deal with infinity being the maximum
    1104         132 :         const auto maskIsInf = cmpeq_ps(maxV, infv);
    1105         132 :         rms = blendv_ps(rms, infv, maskIsInf);
    1106             : 
    1107         132 :         rms = FIXUP_LANES(rms);
    1108             : 
    1109         132 :         storeu_ps(&pDstScanline[iDstPixel], rms);
    1110         132 :         pSrcScanlineShifted += DEST_ELTS * 2;
    1111             :     }
    1112             : 
    1113          66 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    1114          66 :     return iDstPixel;
    1115             : }
    1116             : 
    1117             : /************************************************************************/
    1118             : /*                          AverageFloatSSE2()                          */
    1119             : /************************************************************************/
    1120             : 
    1121          50 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
    1122             :                             const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1123             :                             float *CPL_RESTRICT pDstScanline)
    1124             : {
    1125             :     // SSE2 fast path averaging 4 Float32 inputs per output: each iteration writes
    1126             :     // DEST_ELTS outputs from two source lines nChunkXSize apart; returns outputs written.
    1127          50 :     const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
    1128             : 
    1129          50 :     int iDstPixel = 0;
    1130          50 :     const auto zeroDot25 = _mm_set1_ps(0.25f);  // weight of each of the 4 inputs
    1131          50 :     constexpr int DEST_ELTS =
    1132             :         static_cast<int>(sizeof(zeroDot25) / sizeof(float));  // floats per vector
    1133             : 
    1134         132 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
    1135             :     {  // Only full groups of DEST_ELTS outputs; the caller's scalar loop does the rest
    1136             :         // Load 2 * DEST_ELTS Float32 from each line, scaling each by 0.25 before summing
    1137             :         const auto firstLineLo =
    1138          82 :             _mm_mul_ps(_mm_loadu_ps(pSrcScanlineShifted), zeroDot25);
    1139         164 :         const auto firstLineHi = _mm_mul_ps(
    1140             :             _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS), zeroDot25);
    1141          82 :         const auto secondLineLo = _mm_mul_ps(
    1142          82 :             _mm_loadu_ps(pSrcScanlineShifted + nChunkXSize), zeroDot25);
    1143         164 :         const auto secondLineHi = _mm_mul_ps(
    1144          82 :             _mm_loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize),
    1145             :             zeroDot25);
    1146             : 
    1147             :         // Vertical addition: add the first and second line element by element
    1148          82 :         const auto tmpLo = _mm_add_ps(firstLineLo, secondLineLo);
    1149          82 :         const auto tmpHi = _mm_add_ps(firstLineHi, secondLineHi);
    1150             : 
    1151             :         // Horizontal addition (sse2_hadd_ps is defined elsewhere; presumably sums adjacent pairs)
    1152          82 :         const auto average = sse2_hadd_ps(tmpLo, tmpHi);
    1153             : 
    1154          82 :         _mm_storeu_ps(&pDstScanline[iDstPixel], average);
    1155          82 :         pSrcScanlineShifted += DEST_ELTS * 2;  // two source values consumed per output
    1156             :     }
    1157             : 
    1158          50 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;  // hand the advanced pointer back
    1159          50 :     return iDstPixel;
    1160             : }
    1161             : 
    1162             : /************************************************************************/
    1163             : /*                         AverageDoubleSSE2()                          */
    1164             : /************************************************************************/
    1165             : 
    1166             : static int
    1167          50 : AverageDoubleSSE2(int nDstXWidth, int nChunkXSize,
    1168             :                   const double *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1169             :                   double *CPL_RESTRICT pDstScanline)
    1170             : {
    1171             :     // SSE2 fast path averaging 4 Float64 inputs per output: each iteration writes
    1172             :     // DEST_ELTS outputs from two source lines nChunkXSize apart; returns outputs written.
    1173          50 :     const double *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
    1174             : 
    1175          50 :     int iDstPixel = 0;
    1176          50 :     const auto zeroDot25 = _mm_set1_pd(0.25);  // weight of each of the 4 inputs
    1177          50 :     constexpr int DEST_ELTS =
    1178             :         static_cast<int>(sizeof(zeroDot25) / sizeof(double));  // doubles per vector
    1179             : 
    1180         211 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
    1181             :     {  // Only full groups of DEST_ELTS outputs; the caller's scalar loop does the rest
    1182             :         // Load 2 * DEST_ELTS Float64 from each line (two vectors per line), scaled by 0.25
    1183         161 :         const auto firstLine0 = _mm_mul_pd(
    1184             :             _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS), zeroDot25);
    1185         322 :         const auto firstLine1 = _mm_mul_pd(
    1186             :             _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS), zeroDot25);
    1187         161 :         const auto secondLine0 = _mm_mul_pd(
    1188         161 :             _mm_loadu_pd(pSrcScanlineShifted + 0 * DEST_ELTS + nChunkXSize),
    1189             :             zeroDot25);
    1190         322 :         const auto secondLine1 = _mm_mul_pd(
    1191         161 :             _mm_loadu_pd(pSrcScanlineShifted + 1 * DEST_ELTS + nChunkXSize),
    1192             :             zeroDot25);
    1193             : 
    1194             :         // Vertical addition: add the first and second line element by element
    1195         161 :         const auto tmp0 = _mm_add_pd(firstLine0, secondLine0);
    1196         161 :         const auto tmp1 = _mm_add_pd(firstLine1, secondLine1);
    1197             : 
    1198             :         // Horizontal addition (sse2_hadd_pd is defined elsewhere; presumably sums adjacent pairs)
    1199         161 :         const auto average0 = sse2_hadd_pd(tmp0, tmp1);
    1200             : 
    1201         161 :         _mm_storeu_pd(&pDstScanline[iDstPixel + 0], average0);
    1202         161 :         pSrcScanlineShifted += DEST_ELTS * 2;  // two source values consumed per output
    1203             :     }
    1204             : 
    1205          50 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;  // hand the advanced pointer back
    1206          50 :     return iDstPixel;
    1207             : }
    1208             : 
    1209             : #endif
    1210             : 
    1211             : #endif
    1212             : 
    1213             : /************************************************************************/
    1214             : /*                   GDALResampleChunk_AverageOrRMS()                   */
    1215             : /************************************************************************/
    1216             : 
    1217             : template <class T, class Tsum, GDALDataType eWrkDataType, bool bQuadraticMean>
    1218             : static CPLErr
    1219        7362 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
    1220             :                                  const T *pChunk, void **ppDstBuffer)
    1221             : {
    1222        7362 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    1223        7362 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    1224        7362 :     const double dfSrcXDelta = args.dfSrcXDelta;
    1225        7362 :     const double dfSrcYDelta = args.dfSrcYDelta;
    1226        7362 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    1227        7362 :     const int nChunkXOff = args.nChunkXOff;
    1228        7362 :     const int nChunkYOff = args.nChunkYOff;
    1229        7362 :     const int nChunkXSize = args.nChunkXSize;
    1230        7362 :     const int nChunkYSize = args.nChunkYSize;
    1231        7362 :     const int nDstXOff = args.nDstXOff;
    1232        7362 :     const int nDstXOff2 = args.nDstXOff2;
    1233        7362 :     const int nDstYOff = args.nDstYOff;
    1234        7362 :     const int nDstYOff2 = args.nDstYOff2;
    1235        7362 :     const char *pszResampling = args.pszResampling;
    1236        7362 :     bool bHasNoData = args.bHasNoData;
    1237        7362 :     const double dfNoDataValue = args.dfNoDataValue;
    1238        7362 :     const GDALColorTable *const poColorTable =
    1239             :         !bQuadraticMean &&
    1240             :                 // AVERAGE_BIT2GRAYSCALE
    1241        7279 :                 CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"))
    1242             :             ? nullptr
    1243             :             : args.poColorTable;
    1244        7362 :     const bool bPropagateNoData = args.bPropagateNoData;
    1245             : 
    1246        7362 :     T tNoDataValue = (!bHasNoData) ? 0 : static_cast<T>(dfNoDataValue);
    1247        7362 :     const T tReplacementVal =
    1248         206 :         bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
    1249          72 :                          args.eOvrDataType, dfNoDataValue))
    1250             :                    : 0;
    1251             : 
    1252        7362 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    1253        7362 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    1254        7362 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
    1255             : 
    1256             :     /* -------------------------------------------------------------------- */
    1257             :     /*      Allocate buffers.                                               */
    1258             :     /* -------------------------------------------------------------------- */
    1259        7362 :     *ppDstBuffer = static_cast<T *>(
    1260        7362 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
    1261             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
    1262        7362 :     if (*ppDstBuffer == nullptr)
    1263             :     {
    1264           0 :         return CE_Failure;
    1265             :     }
    1266        7362 :     T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
    1267             : 
    1268             :     struct PrecomputedXValue
    1269             :     {
    1270             :         int nLeftXOffShifted;
    1271             :         int nRightXOffShifted;
    1272             :         double dfLeftWeight;
    1273             :         double dfRightWeight;
    1274             :         double dfTotalWeightFullLine;
    1275             :     };
    1276             : 
    1277             :     PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
    1278        7362 :         VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
    1279             : 
    1280        7362 :     if (pasSrcX == nullptr)
    1281             :     {
    1282           0 :         return CE_Failure;
    1283             :     }
    1284             : 
    1285        7362 :     std::vector<GDALColorEntry> colorEntries;
    1286             : 
    1287        7362 :     if (poColorTable)
    1288             :     {
    1289           5 :         int nTransparentIdx = -1;
    1290           5 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    1291             : 
    1292             :         // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    1293             :         // it as nodata value
    1294           6 :         if (bHasNoData && dfNoDataValue >= 0.0 &&
    1295           1 :             tNoDataValue < colorEntries.size())
    1296           1 :             colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
    1297             : 
    1298             :         // Or if we have no explicit nodata, but a color table entry that is
    1299             :         // transparent, consider it as the nodata value
    1300           4 :         else if (!bHasNoData && nTransparentIdx >= 0)
    1301             :         {
    1302           0 :             bHasNoData = true;
    1303           0 :             tNoDataValue = static_cast<T>(nTransparentIdx);
    1304             :         }
    1305             :     }
    1306             : 
    1307             :     /* ==================================================================== */
    1308             :     /*      Precompute inner loop constants.                                */
    1309             :     /* ==================================================================== */
    1310        7362 :     bool bSrcXSpacingIsTwo = true;
    1311        7362 :     int nLastSrcXOff2 = -1;
    1312     1689160 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    1313             :     {
    1314     1681805 :         const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    1315             :         // Apply some epsilon to avoid numerical precision issues
    1316     1681805 :         int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    1317     1681805 :         const double dfSrcXOff2 =
    1318     1681805 :             dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    1319     1681805 :         int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    1320             : 
    1321     1681805 :         if (nSrcXOff < nChunkXOff)
    1322           0 :             nSrcXOff = nChunkXOff;
    1323     1681805 :         if (nSrcXOff2 == nSrcXOff)
    1324           0 :             nSrcXOff2++;
    1325     1681805 :         if (nSrcXOff2 > nChunkRightXOff)
    1326           1 :             nSrcXOff2 = nChunkRightXOff;
    1327             : 
    1328     1681805 :         pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
    1329     1681805 :         pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
    1330     1681805 :             nSrcXOff2 - nChunkXOff;
    1331          21 :         pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
    1332     1681805 :             (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
    1333     1681805 :         pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
    1334     1681805 :             1 - (nSrcXOff2 - dfSrcXOff2);
    1335     1681805 :         pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
    1336     1681805 :             pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
    1337     1681805 :         if (nSrcXOff + 1 < nSrcXOff2)
    1338             :         {
    1339     1681779 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1340     1681779 :                 nSrcXOff2 - nSrcXOff - 2;
    1341     1681779 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1342     1681779 :                 pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
    1343             :         }
    1344             : 
    1345     1681805 :         if (nSrcXOff2 - nSrcXOff != 2 ||
    1346     1583882 :             (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
    1347             :         {
    1348       91989 :             bSrcXSpacingIsTwo = false;
    1349             :         }
    1350     1681805 :         nLastSrcXOff2 = nSrcXOff2;
    1351             :     }
    1352             : 
    1353             :     /* ==================================================================== */
    1354             :     /*      Loop over destination scanlines.                                */
    1355             :     /* ==================================================================== */
    1356      705422 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    1357             :     {
    1358      698060 :         const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    1359      698060 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    1360      698060 :         if (nSrcYOff < nChunkYOff)
    1361           0 :             nSrcYOff = nChunkYOff;
    1362             : 
    1363      698060 :         const double dfSrcYOff2 =
    1364      698060 :             dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    1365      698060 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    1366      698060 :         if (nSrcYOff2 == nSrcYOff)
    1367           0 :             ++nSrcYOff2;
    1368      698060 :         if (nSrcYOff2 > nChunkBottomYOff)
    1369           3 :             nSrcYOff2 = nChunkBottomYOff;
    1370             : 
    1371      698060 :         T *const pDstScanline =
    1372      698060 :             pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
    1373             : 
    1374             :         /* --------------------------------------------------------------------
    1375             :          */
    1376             :         /*      Loop over destination pixels */
    1377             :         /* --------------------------------------------------------------------
    1378             :          */
    1379      698060 :         if (poColorTable == nullptr)
    1380             :         {
    1381      697945 :             if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
    1382             :                 pabyChunkNodataMask == nullptr)
    1383             :             {
    1384             :                 if constexpr (eWrkDataType == GDT_UInt8 ||
    1385             :                               eWrkDataType == GDT_UInt16)
    1386             :                 {
    1387             :                     // Optimized case : no nodata, overview by a factor of 2 and
    1388             :                     // regular x and y src spacing.
    1389      129392 :                     const T *pSrcScanlineShifted =
    1390      129392 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1391      129392 :                         static_cast<size_t>(nSrcYOff - nChunkYOff) *
    1392      129392 :                             nChunkXSize;
    1393      129392 :                     int iDstPixel = 0;
    1394             : #ifdef USE_SSE2
    1395             :                     if constexpr (eWrkDataType == GDT_UInt8)
    1396             :                     {
    1397             :                         if constexpr (bQuadraticMean)
    1398             :                         {
    1399        5389 :                             iDstPixel = QuadraticMeanByteSSE2OrAVX2(
    1400             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1401             :                                 pDstScanline);
    1402             :                         }
    1403             :                         else
    1404             :                         {
    1405      123976 :                             iDstPixel = AverageByteSSE2OrAVX2(
    1406             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1407             :                                 pDstScanline);
    1408             :                         }
    1409             :                     }
    1410             :                     else
    1411             :                     {
    1412             :                         static_assert(eWrkDataType == GDT_UInt16);
    1413             :                         if constexpr (bQuadraticMean)
    1414             :                         {
    1415          14 :                             iDstPixel = QuadraticMeanUInt16SSE2(
    1416             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1417             :                                 pDstScanline);
    1418             :                         }
    1419             :                         else
    1420             :                         {
    1421          13 :                             iDstPixel = AverageUInt16SSE2(
    1422             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1423             :                                 pDstScanline);
    1424             :                         }
    1425             :                     }
    1426             : #endif
    1427      303851 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1428             :                     {
    1429      174459 :                         Tsum nTotal = 0;
    1430             :                         T nVal;
    1431             :                         if constexpr (bQuadraticMean)
    1432          52 :                             nTotal =
    1433          52 :                                 SQUARE<Tsum>(pSrcScanlineShifted[0]) +
    1434          52 :                                 SQUARE<Tsum>(pSrcScanlineShifted[1]) +
    1435          52 :                                 SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
    1436          52 :                                 SQUARE<Tsum>(
    1437          52 :                                     pSrcScanlineShifted[1 + nChunkXSize]);
    1438             :                         else
    1439      174407 :                             nTotal = pSrcScanlineShifted[0] +
    1440      174407 :                                      pSrcScanlineShifted[1] +
    1441      174407 :                                      pSrcScanlineShifted[nChunkXSize] +
    1442      174407 :                                      pSrcScanlineShifted[1 + nChunkXSize];
    1443             : 
    1444      174459 :                         constexpr int nTotalWeight = 4;
    1445             :                         if constexpr (bQuadraticMean)
    1446          52 :                             nVal = ComputeIntegerRMS_4values<T>(nTotal);
    1447             :                         else
    1448      174407 :                             nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
    1449             :                                                   nTotalWeight);
    1450             : 
    1451             :                         // No need to compare nVal against tNoDataValue as we
    1452             :                         // are in a case where pabyChunkNodataMask == nullptr
    1453             :                         // implies the absence of nodata value.
    1454      174459 :                         pDstScanline[iDstPixel] = nVal;
    1455      174459 :                         pSrcScanlineShifted += 2;
    1456             :                     }
    1457             :                 }
    1458             :                 else
    1459             :                 {
    1460             :                     static_assert(eWrkDataType == GDT_Float32 ||
    1461             :                                   eWrkDataType == GDT_Float64);
    1462         202 :                     const T *pSrcScanlineShifted =
    1463         202 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1464         202 :                         static_cast<size_t>(nSrcYOff - nChunkYOff) *
    1465         202 :                             nChunkXSize;
    1466         202 :                     int iDstPixel = 0;
    1467             : #if defined(USE_SSE2) && !defined(ARM_V7)
    1468             :                     if constexpr (eWrkDataType == GDT_Float32)
    1469             :                     {
    1470             :                         static_assert(std::is_same_v<T, float>);
    1471             :                         if constexpr (bQuadraticMean)
    1472             :                         {
    1473          66 :                             iDstPixel = QuadraticMeanFloatSSE2(
    1474             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1475             :                                 pDstScanline);
    1476             :                         }
    1477             :                         else
    1478             :                         {
    1479          50 :                             iDstPixel = AverageFloatSSE2(
    1480             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1481             :                                 pDstScanline);
    1482             :                         }
    1483             :                     }
    1484             :                     else
    1485             :                     {
    1486             :                         if constexpr (!bQuadraticMean)
    1487             :                         {
    1488          50 :                             iDstPixel = AverageDoubleSSE2(
    1489             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1490             :                                 pDstScanline);
    1491             :                         }
    1492             :                     }
    1493             : #endif
    1494             : 
    1495         726 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1496             :                     {
    1497             :                         T nVal;
    1498             : 
    1499             :                         if constexpr (bQuadraticMean)
    1500             :                         {
    1501             :                             // Avoid issues with large values by renormalizing
    1502          96 :                             const auto max = std::max(
    1503         420 :                                 {std::fabs(pSrcScanlineShifted[0]),
    1504         420 :                                  std::fabs(pSrcScanlineShifted[1]),
    1505         420 :                                  std::fabs(pSrcScanlineShifted[nChunkXSize]),
    1506         420 :                                  std::fabs(
    1507         420 :                                      pSrcScanlineShifted[1 + nChunkXSize])});
    1508         420 :                             if (max == 0)
    1509             :                             {
    1510           8 :                                 nVal = 0;
    1511             :                             }
    1512         412 :                             else if (std::isinf(max))
    1513             :                             {
    1514             :                                 // If there is at least one infinity value,
    1515             :                                 // then just summing, and taking the abs
    1516             :                                 // value will give the expected result:
    1517             :                                 // * +inf if all values are +inf
    1518             :                                 // * +inf if all values are -inf
    1519             :                                 // * NaN otherwise
    1520          82 :                                 nVal = std::fabs(
    1521          82 :                                     pSrcScanlineShifted[0] +
    1522          82 :                                     pSrcScanlineShifted[1] +
    1523          82 :                                     pSrcScanlineShifted[nChunkXSize] +
    1524          82 :                                     pSrcScanlineShifted[1 + nChunkXSize]);
    1525             :                             }
    1526             :                             else
    1527             :                             {
    1528         330 :                                 const auto inv_max = static_cast<T>(1.0) / max;
    1529         330 :                                 nVal =
    1530             :                                     max *
    1531         330 :                                     std::sqrt(
    1532             :                                         static_cast<T>(0.25) *
    1533         330 :                                         (SQUARE(pSrcScanlineShifted[0] *
    1534         330 :                                                 inv_max) +
    1535         330 :                                          SQUARE(pSrcScanlineShifted[1] *
    1536         330 :                                                 inv_max) +
    1537         330 :                                          SQUARE(
    1538         330 :                                              pSrcScanlineShifted[nChunkXSize] *
    1539         330 :                                              inv_max) +
    1540         330 :                                          SQUARE(
    1541         330 :                                              pSrcScanlineShifted[1 +
    1542             :                                                                  nChunkXSize] *
    1543             :                                              inv_max)));
    1544             :                             }
    1545             :                         }
    1546             :                         else
    1547             :                         {
    1548         104 :                             constexpr auto weight = static_cast<T>(0.25);
    1549             :                             // Multiply each value by weight to avoid
    1550             :                             // potential overflow
    1551         104 :                             nVal =
    1552         104 :                                 (weight * pSrcScanlineShifted[0] +
    1553         104 :                                  weight * pSrcScanlineShifted[1] +
    1554         104 :                                  weight * pSrcScanlineShifted[nChunkXSize] +
    1555         104 :                                  weight * pSrcScanlineShifted[1 + nChunkXSize]);
    1556             :                         }
    1557             : 
    1558             :                         // No need to compare nVal against tNoDataValue as we
    1559             :                         // are in a case where pabyChunkNodataMask == nullptr
    1560             :                         // implies the absence of nodata value.
    1561         524 :                         pDstScanline[iDstPixel] = nVal;
    1562         524 :                         pSrcScanlineShifted += 2;
    1563             :                     }
    1564      129594 :                 }
    1565             :             }
    1566             :             else
    1567             :             {
    1568          17 :                 const double dfBottomWeight =
    1569      568351 :                     (nSrcYOff + 1 == nSrcYOff2) ? 1.0
    1570      568334 :                                                 : 1.0 - (dfSrcYOff - nSrcYOff);
    1571      568351 :                 const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
    1572      568351 :                 nSrcYOff -= nChunkYOff;
    1573      568351 :                 nSrcYOff2 -= nChunkYOff;
    1574             : 
    1575      568351 :                 double dfTotalWeightFullColumn = dfBottomWeight;
    1576      568351 :                 if (nSrcYOff + 1 < nSrcYOff2)
    1577             :                 {
    1578      568334 :                     dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
    1579      568334 :                     dfTotalWeightFullColumn += dfTopWeight;
    1580             :                 }
    1581             : 
    1582     9784185 :                 for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1583             :                 {
    1584     9215839 :                     const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1585     9215839 :                     const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1586             : 
    1587     9215839 :                     double dfTotal = 0;
    1588     9215839 :                     double dfTotalWeight = 0;
    1589     9215839 :                     [[maybe_unused]] double dfMulFactor = 1.0;
    1590     9215839 :                     [[maybe_unused]] double dfInvMulFactor = 1.0;
    1591     9215839 :                     constexpr bool bUseMulFactor =
    1592             :                         (eWrkDataType == GDT_Float32 ||
    1593             :                          eWrkDataType == GDT_Float64);
    1594     9215839 :                     if (pabyChunkNodataMask == nullptr)
    1595             :                     {
    1596             :                         if constexpr (bUseMulFactor)
    1597             :                         {
    1598             :                             if constexpr (bQuadraticMean)
    1599             :                             {
    1600          80 :                                 T mulFactor = 0;
    1601          80 :                                 auto pChunkShifted =
    1602          80 :                                     pChunk +
    1603          80 :                                     static_cast<size_t>(nSrcYOff) * nChunkXSize;
    1604             : 
    1605         240 :                                 for (int iY = nSrcYOff; iY < nSrcYOff2;
    1606         160 :                                      ++iY, pChunkShifted += nChunkXSize)
    1607             :                                 {
    1608         480 :                                     for (int iX = nSrcXOff; iX < nSrcXOff2;
    1609             :                                          ++iX)
    1610         640 :                                         mulFactor = std::max(
    1611             :                                             mulFactor,
    1612         320 :                                             std::fabs(pChunkShifted[iX]));
    1613             :                                 }
    1614          80 :                                 dfMulFactor = double(mulFactor);
    1615         142 :                                 dfInvMulFactor =
    1616          62 :                                     dfMulFactor > 0 &&
    1617          62 :                                             std::isfinite(dfMulFactor)
    1618             :                                         ? 1.0 / dfMulFactor
    1619             :                                         : 1.0;
    1620             :                             }
    1621             :                             else
    1622             :                             {
    1623         139 :                                 dfMulFactor = (nSrcYOff2 - nSrcYOff) *
    1624         139 :                                               (nSrcXOff2 - nSrcXOff);
    1625         139 :                                 dfInvMulFactor = 1.0 / dfMulFactor;
    1626             :                             }
    1627             :                         }
    1628             : 
    1629     1746545 :                         auto pChunkShifted =
    1630         227 :                             pChunk +
    1631     1746545 :                             static_cast<size_t>(nSrcYOff) * nChunkXSize;
    1632     1746545 :                         int nCounterY = nSrcYOff2 - nSrcYOff - 1;
    1633     1746545 :                         double dfWeightY = dfBottomWeight;
    1634     3493539 :                         while (true)
    1635             :                         {
    1636             :                             double dfTotalLine;
    1637             :                             if constexpr (bQuadraticMean)
    1638             :                             {
    1639             :                                 // Left pixel
    1640             :                                 {
    1641         216 :                                     const T val = pChunkShifted[nSrcXOff];
    1642         216 :                                     dfTotalLine =
    1643         216 :                                         SQUARE(double(val) * dfInvMulFactor) *
    1644         216 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1645             :                                 }
    1646             : 
    1647         216 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1648             :                                 {
    1649             :                                     // Middle pixels
    1650         216 :                                     for (int iX = nSrcXOff + 1;
    1651         536 :                                          iX < nSrcXOff2 - 1; ++iX)
    1652             :                                     {
    1653         320 :                                         const T val = pChunkShifted[iX];
    1654         320 :                                         dfTotalLine += SQUARE(double(val) *
    1655             :                                                               dfInvMulFactor);
    1656             :                                     }
    1657             : 
    1658             :                                     // Right pixel
    1659             :                                     {
    1660         216 :                                         const T val =
    1661         216 :                                             pChunkShifted[nSrcXOff2 - 1];
    1662         216 :                                         dfTotalLine +=
    1663         216 :                                             SQUARE(double(val) *
    1664         216 :                                                    dfInvMulFactor) *
    1665         216 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1666             :                                     }
    1667             :                                 }
    1668             :                             }
    1669             :                             else
    1670             :                             {
    1671             :                                 // Left pixel
    1672             :                                 {
    1673     5239868 :                                     const T val = pChunkShifted[nSrcXOff];
    1674     5239868 :                                     dfTotalLine =
    1675     5239868 :                                         double(val) * dfInvMulFactor *
    1676     5239868 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1677             :                                 }
    1678             : 
    1679     5239868 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1680             :                                 {
    1681             :                                     // Middle pixels
    1682     4239442 :                                     for (int iX = nSrcXOff + 1;
    1683    64183238 :                                          iX < nSrcXOff2 - 1; ++iX)
    1684             :                                     {
    1685    59943836 :                                         const T val = pChunkShifted[iX];
    1686    59943836 :                                         dfTotalLine +=
    1687    59943836 :                                             double(val) * dfInvMulFactor;
    1688             :                                     }
    1689             : 
    1690             :                                     // Right pixel
    1691             :                                     {
    1692     4239442 :                                         const T val =
    1693     4239442 :                                             pChunkShifted[nSrcXOff2 - 1];
    1694     4239442 :                                         dfTotalLine +=
    1695     4239442 :                                             double(val) * dfInvMulFactor *
    1696     4239442 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1697             :                                     }
    1698             :                                 }
    1699             :                             }
    1700             : 
    1701     5240084 :                             dfTotal += dfTotalLine * dfWeightY;
    1702     5240084 :                             --nCounterY;
    1703     5240084 :                             if (nCounterY < 0)
    1704     1746545 :                                 break;
    1705     3493539 :                             pChunkShifted += nChunkXSize;
    1706     3493539 :                             dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
    1707             :                         }
    1708             : 
    1709     1746545 :                         dfTotalWeight =
    1710     1746545 :                             pasSrcX[iDstPixel].dfTotalWeightFullLine *
    1711             :                             dfTotalWeightFullColumn;
    1712             :                     }
    1713             :                     else
    1714             :                     {
    1715     7469294 :                         size_t nCount = 0;
    1716    30285576 :                         for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1717             :                         {
    1718    22816292 :                             const auto pChunkShifted =
    1719    22816292 :                                 pChunk + static_cast<size_t>(iY) * nChunkXSize;
    1720             : 
    1721    22816292 :                             double dfTotalLine = 0;
    1722    22816292 :                             double dfTotalWeightLine = 0;
    1723             :                             // Left pixel
    1724             :                             {
    1725    22816292 :                                 const int iX = nSrcXOff;
    1726    22816292 :                                 const T val = pChunkShifted[iX];
    1727    22816292 :                                 if (pabyChunkNodataMask
    1728    22816292 :                                         [iX +
    1729    22816292 :                                          static_cast<size_t>(iY) * nChunkXSize])
    1730             :                                 {
    1731    17325139 :                                     nCount++;
    1732    17325139 :                                     const double dfWeightX =
    1733    17325139 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1734    17325139 :                                     dfTotalWeightLine = dfWeightX;
    1735             :                                     if constexpr (bQuadraticMean)
    1736         508 :                                         dfTotalLine =
    1737         508 :                                             SQUARE(double(val)) * dfWeightX;
    1738             :                                     else
    1739    17324631 :                                         dfTotalLine = double(val) * dfWeightX;
    1740             :                                 }
    1741             :                             }
    1742             : 
    1743    22816292 :                             if (nSrcXOff < nSrcXOff2 - 1)
    1744             :                             {
    1745             :                                 // Middle pixels
    1746    61618372 :                                 for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
    1747             :                                      ++iX)
    1748             :                                 {
    1749    38802080 :                                     const T val = pChunkShifted[iX];
    1750    38802080 :                                     if (pabyChunkNodataMask
    1751    38802080 :                                             [iX + static_cast<size_t>(iY) *
    1752    38802080 :                                                       nChunkXSize])
    1753             :                                     {
    1754    28038780 :                                         nCount++;
    1755    28038780 :                                         dfTotalWeightLine += 1;
    1756             :                                         if constexpr (bQuadraticMean)
    1757         640 :                                             dfTotalLine += SQUARE(double(val));
    1758             :                                         else
    1759    28038140 :                                             dfTotalLine += double(val);
    1760             :                                     }
    1761             :                                 }
    1762             : 
    1763             :                                 // Right pixel
    1764             :                                 {
    1765    22816292 :                                     const int iX = nSrcXOff2 - 1;
    1766    22816292 :                                     const T val = pChunkShifted[iX];
    1767    22816292 :                                     if (pabyChunkNodataMask
    1768    22816292 :                                             [iX + static_cast<size_t>(iY) *
    1769    22816292 :                                                       nChunkXSize])
    1770             :                                     {
    1771    17324495 :                                         nCount++;
    1772    17324495 :                                         const double dfWeightX =
    1773    17324495 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1774    17324495 :                                         dfTotalWeightLine += dfWeightX;
    1775             :                                         if constexpr (bQuadraticMean)
    1776         503 :                                             dfTotalLine +=
    1777         503 :                                                 SQUARE(double(val)) * dfWeightX;
    1778             :                                         else
    1779    17323992 :                                             dfTotalLine +=
    1780    17323992 :                                                 double(val) * dfWeightX;
    1781             :                                     }
    1782             :                                 }
    1783             :                             }
    1784             : 
    1785    38163300 :                             const double dfWeightY =
    1786             :                                 (iY == nSrcYOff)        ? dfBottomWeight
    1787    15347008 :                                 : (iY + 1 == nSrcYOff2) ? dfTopWeight
    1788             :                                                         : 1.0;
    1789    22816292 :                             dfTotal += dfTotalLine * dfWeightY;
    1790    22816292 :                             dfTotalWeight += dfTotalWeightLine * dfWeightY;
    1791             :                         }
    1792             : 
    1793     7469294 :                         if (nCount == 0 ||
    1794           8 :                             (bPropagateNoData &&
    1795             :                              nCount <
    1796           8 :                                  static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    1797           8 :                                      (nSrcXOff2 - nSrcXOff)))
    1798             :                         {
    1799     2307682 :                             pDstScanline[iDstPixel] = tNoDataValue;
    1800     2307682 :                             continue;
    1801             :                         }
    1802             :                     }
    1803             :                     if constexpr (eWrkDataType == GDT_UInt8)
    1804             :                     {
    1805             :                         T nVal;
    1806             :                         if constexpr (bQuadraticMean)
    1807          38 :                             nVal = ComputeIntegerRMS<T, int>(dfTotal,
    1808             :                                                              dfTotalWeight);
    1809             :                         else
    1810     6901260 :                             nVal =
    1811     6901260 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1812     6901298 :                         if (bHasNoData && nVal == tNoDataValue)
    1813           0 :                             nVal = tReplacementVal;
    1814     6901298 :                         pDstScanline[iDstPixel] = nVal;
    1815             :                     }
    1816             :                     else if constexpr (eWrkDataType == GDT_UInt16)
    1817             :                     {
    1818             :                         T nVal;
    1819             :                         if constexpr (bQuadraticMean)
    1820           4 :                             nVal = ComputeIntegerRMS<T, uint64_t>(
    1821             :                                 dfTotal, dfTotalWeight);
    1822             :                         else
    1823           4 :                             nVal =
    1824           4 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1825           8 :                         if (bHasNoData && nVal == tNoDataValue)
    1826           0 :                             nVal = tReplacementVal;
    1827           8 :                         pDstScanline[iDstPixel] = nVal;
    1828             :                     }
    1829             :                     else
    1830             :                     {
    1831             :                         T nVal;
    1832             :                         if constexpr (bQuadraticMean)
    1833             :                         {
    1834             :                             if constexpr (bUseMulFactor)
    1835         249 :                                 nVal = static_cast<T>(
    1836         132 :                                     dfMulFactor *
    1837         249 :                                     sqrt(dfTotal / dfTotalWeight));
    1838             :                             else
    1839             :                                 nVal = static_cast<T>(
    1840             :                                     sqrt(dfTotal / dfTotalWeight));
    1841             :                         }
    1842             :                         else
    1843             :                         {
    1844             :                             if constexpr (bUseMulFactor)
    1845        6602 :                                 nVal = static_cast<T>(
    1846        6602 :                                     dfMulFactor * (dfTotal / dfTotalWeight));
    1847             :                             else
    1848             :                                 nVal = static_cast<T>(dfTotal / dfTotalWeight);
    1849             :                         }
    1850        6851 :                         if (bHasNoData && nVal == tNoDataValue)
    1851           2 :                             nVal = tReplacementVal;
    1852        6851 :                         pDstScanline[iDstPixel] = nVal;
    1853             :                     }
    1854             :                 }
    1855             :             }
    1856             :         }
    1857             :         else
    1858             :         {
    1859         115 :             nSrcYOff -= nChunkYOff;
    1860         115 :             nSrcYOff2 -= nChunkYOff;
    1861             : 
    1862        6590 :             for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1863             :             {
    1864        6475 :                 const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1865        6475 :                 const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1866             : 
    1867        6475 :                 uint64_t nTotalR = 0;
    1868        6475 :                 uint64_t nTotalG = 0;
    1869        6475 :                 uint64_t nTotalB = 0;
    1870        6475 :                 size_t nCount = 0;
    1871             : 
    1872       19425 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1873             :                 {
    1874       38850 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    1875             :                     {
    1876       25900 :                         const T val =
    1877       25900 :                             pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
    1878             :                         // cppcheck-suppress unsignedLessThanZero
    1879       25900 :                         if (val < 0 || val >= colorEntries.size())
    1880           0 :                             continue;
    1881       25900 :                         const size_t idx = static_cast<size_t>(val);
    1882       25900 :                         const auto &entry = colorEntries[idx];
    1883       25900 :                         if (entry.c4)
    1884             :                         {
    1885             :                             if constexpr (bQuadraticMean)
    1886             :                             {
    1887         800 :                                 nTotalR += SQUARE<int>(entry.c1);
    1888         800 :                                 nTotalG += SQUARE<int>(entry.c2);
    1889         800 :                                 nTotalB += SQUARE<int>(entry.c3);
    1890         800 :                                 ++nCount;
    1891             :                             }
    1892             :                             else
    1893             :                             {
    1894       13328 :                                 nTotalR += entry.c1;
    1895       13328 :                                 nTotalG += entry.c2;
    1896       13328 :                                 nTotalB += entry.c3;
    1897       13328 :                                 ++nCount;
    1898             :                             }
    1899             :                         }
    1900             :                     }
    1901             :                 }
    1902             : 
    1903        6475 :                 if (nCount == 0 ||
    1904           0 :                     (bPropagateNoData &&
    1905           0 :                      nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    1906           0 :                                   (nSrcXOff2 - nSrcXOff)))
    1907             :                 {
    1908        2838 :                     pDstScanline[iDstPixel] = tNoDataValue;
    1909             :                 }
    1910             :                 else
    1911             :                 {
    1912             :                     GDALColorEntry color;
    1913             :                     if constexpr (bQuadraticMean)
    1914             :                     {
    1915         200 :                         color.c1 =
    1916         200 :                             static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
    1917         200 :                         color.c2 =
    1918         200 :                             static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
    1919         200 :                         color.c3 =
    1920         200 :                             static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
    1921             :                     }
    1922             :                     else
    1923             :                     {
    1924        3437 :                         color.c1 =
    1925        3437 :                             static_cast<short>((nTotalR + nCount / 2) / nCount);
    1926        3437 :                         color.c2 =
    1927        3437 :                             static_cast<short>((nTotalG + nCount / 2) / nCount);
    1928        3437 :                         color.c3 =
    1929        3437 :                             static_cast<short>((nTotalB + nCount / 2) / nCount);
    1930             :                     }
    1931        3637 :                     pDstScanline[iDstPixel] =
    1932        3637 :                         static_cast<T>(BestColorEntry(colorEntries, color));
    1933             :                 }
    1934             :             }
    1935             :         }
    1936             :     }
    1937             : 
    1938        7362 :     CPLFree(pasSrcX);
    1939             : 
    1940        7362 :     return CE_None;
    1941             : }
    1942             : 
    1943             : template <bool bQuadraticMean>
                       // Type dispatcher for the average / RMS chunk resampler.
                       //
                       // Records args.eWrkDataType in *peDstBufferDataType, then forwards to
                       // the GDALResampleChunk_AverageOrRMS_T instantiation that matches that
                       // working data type, with pChunk cast to the corresponding element
                       // pointer type. bQuadraticMean is passed through unchanged as the last
                       // template argument, and ppDstBuffer is handed to the callee untouched.
                       //
                       // Returns the callee's CPLErr. Only GDT_UInt8, GDT_UInt16, GDT_Float32
                       // and GDT_Float64 are handled; any other working data type reaches
                       // CPLAssert(false) and yields CE_Failure.
    1944             : static CPLErr
    1945        7362 : GDALResampleChunk_AverageOrRMSInternal(const GDALOverviewResampleArgs &args,
    1946             :                                        const void *pChunk, void **ppDstBuffer,
    1947             :                                        GDALDataType *peDstBufferDataType)
    1948             : {
                           // Written before the switch, so it is set on every path, including
                           // the CE_Failure fallback at the bottom.
    1949        7362 :     *peDstBufferDataType = args.eWrkDataType;
    1950        7362 :     switch (args.eWrkDataType)
    1951             :     {
    1952        7217 :         case GDT_UInt8:
    1953             :         {
    1954             :             return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_UInt8,
    1955        7217 :                                                     bQuadraticMean>(
    1956        7217 :                 args, static_cast<const GByte *>(pChunk), ppDstBuffer);
    1957             :         }
    1958             : 
    1959          11 :         case GDT_UInt16:
    1960             :         {
                                   // The second template argument depends on the resampling
                                   // mode for this data type only.
    1961             :             if constexpr (bQuadraticMean)
    1962             :             {
    1963             :                 // Use double as accumulation type, because UInt32 could overflow
    1964             :                 return GDALResampleChunk_AverageOrRMS_T<
    1965           6 :                     GUInt16, double, GDT_UInt16, bQuadraticMean>(
    1966           6 :                     args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
    1967             :             }
    1968             :             else
    1969             :             {
                                       // Non-RMS case: GUInt32 is passed in place of double.
    1970             :                 return GDALResampleChunk_AverageOrRMS_T<
    1971           5 :                     GUInt16, GUInt32, GDT_UInt16, bQuadraticMean>(
    1972           5 :                     args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
    1973             :             }
    1974             :         }
    1975             : 
    1976          81 :         case GDT_Float32:
    1977             :         {
    1978             :             return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32,
    1979          81 :                                                     bQuadraticMean>(
    1980          81 :                 args, static_cast<const float *>(pChunk), ppDstBuffer);
    1981             :         }
    1982             : 
    1983          53 :         case GDT_Float64:
    1984             :         {
    1985             :             return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64,
    1986          53 :                                                     bQuadraticMean>(
    1987          53 :                 args, static_cast<const double *>(pChunk), ppDstBuffer);
    1988             :         }
    1989             : 
    1990           0 :         default:
    1991           0 :             break;
    1992             :     }
    1993             : 
                           // Reached only when args.eWrkDataType has no case above.
                           // NOTE(review): *ppDstBuffer is not written on this path; confirm
                           // callers do not rely on it after a CE_Failure return.
    1994           0 :     CPLAssert(false);
    1995             :     return CE_Failure;
    1996             : }
    1997             : 
    1998             : static CPLErr
                       // Entry point choosing between the RMS and the plain average resampler.
                       //
                       // When EQUAL(args.pszResampling, "RMS") holds, the <true> (quadratic
                       // mean) instantiation of GDALResampleChunk_AverageOrRMSInternal is
                       // used; otherwise the <false> one. All four arguments are forwarded
                       // unchanged and the callee's CPLErr is returned as-is.
                       // NOTE(review): EQUAL is defined outside this view; presumably a
                       // case-insensitive string comparison - confirm.
    1999        7362 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
    2000             :                                const void *pChunk, void **ppDstBuffer,
    2001             :                                GDALDataType *peDstBufferDataType)
    2002             : {
    2003        7362 :     if (EQUAL(args.pszResampling, "RMS"))
    2004          83 :         return GDALResampleChunk_AverageOrRMSInternal<true>(
    2005          83 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    2006             :     else
    2007        7279 :         return GDALResampleChunk_AverageOrRMSInternal<false>(
    2008        7279 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    2009             : }
    2010             : 
    2011             : /************************************************************************/
    2012             : /*                      GDALResampleChunk_Gauss()                       */
    2013             : /************************************************************************/
    2014             : 
    2015          86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
    2016             :                                       const void *pChunk, void **ppDstBuffer,
    2017             :                                       GDALDataType *peDstBufferDataType)
    2018             : // Resamples one chunk with a small integer-weight kernel. pChunk is read as doubles; the Float64 output buffer is allocated here and returned through ppDstBuffer / peDstBufferDataType. Returns CE_Failure only when that allocation fails, CE_None otherwise.
    2019             : {
    2020          86 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    2021          86 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    2022          86 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    2023          86 :     const int nChunkXOff = args.nChunkXOff;
    2024          86 :     const int nChunkXSize = args.nChunkXSize;
    2025          86 :     const int nChunkYOff = args.nChunkYOff;
    2026          86 :     const int nChunkYSize = args.nChunkYSize;
    2027          86 :     const int nDstXOff = args.nDstXOff;
    2028          86 :     const int nDstXOff2 = args.nDstXOff2;
    2029          86 :     const int nDstYOff = args.nDstYOff;
    2030          86 :     const int nDstYOff2 = args.nDstYOff2;
    2031          86 :     const bool bHasNoData = args.bHasNoData;
    2032          86 :     double dfNoDataValue = args.dfNoDataValue;
    2033          86 :     const GDALColorTable *poColorTable = args.poColorTable;
    2034             :     // The source chunk is interpreted as an array of doubles.
    2035          86 :     const double *const padfChunk = static_cast<const double *>(pChunk);
    2036             :     // One Float64 value per pixel of the destination window [nDstXOff, nDstXOff2) x [nDstYOff, nDstYOff2).
    2037          86 :     *ppDstBuffer =
    2038          86 :         VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
    2039             :                             GDALGetDataTypeSizeBytes(GDT_Float64));
    2040          86 :     if (*ppDstBuffer == nullptr)
    2041             :     {
    2042           0 :         return CE_Failure;
    2043             :     }
    2044          86 :     *peDstBufferDataType = GDT_Float64;
    2045          86 :     double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
    2046             :     // The three kernels below are the outer products of the binomial rows (1 2 1), (1 4 6 4 1) and (1 6 15 20 15 6 1), stored row by row.
    2047             :     /* -------------------------------------------------------------------- */
    2048             :     /*      Create the filter kernel.                                       */
    2049             :     /* -------------------------------------------------------------------- */
    2050          86 :     int nGaussMatrixDim = 3;
    2051             :     const int *panGaussMatrix;
    2052          86 :     constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
    2053          86 :     constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
    2054             :                                         4,  6, 24, 36, 24, 6, 4,  16, 24,
    2055             :                                         16, 4, 1,  4,  6,  4, 1};
    2056          86 :     constexpr int anGaussMatrix7x7[] = {
    2057             :         1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
    2058             :         6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
    2059             :         120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
    2060             :         90,  36, 6,  1,   6,   15,  20,  15, 6,  1};
    2061             :     // Full overview dimensions; used below to recognize the last destination row / column.
    2062          86 :     const int nOXSize = args.nOvrXSize;
    2063          86 :     const int nOYSize = args.nOvrYSize;
    2064          86 :     const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    2065             :     // The kernel size is chosen from the rounded vertical ratio only and is used for both axes: 3x3 up to 2, 5x5 up to 4, 7x7 beyond.
    2066             :     // matrix for gauss filter
    2067          86 :     if (nResYFactor <= 2)
    2068             :     {
    2069          85 :         panGaussMatrix = anGaussMatrix3x3;
    2070          85 :         nGaussMatrixDim = 3;
    2071             :     }
    2072           1 :     else if (nResYFactor <= 4)
    2073             :     {
    2074           0 :         panGaussMatrix = anGaussMatrix5x5;
    2075           0 :         nGaussMatrixDim = 5;
    2076             :     }
    2077             :     else
    2078             :     {
    2079           1 :         panGaussMatrix = anGaussMatrix7x7;
    2080           1 :         nGaussMatrixDim = 7;
    2081             :     }
    2082             :     // Debug-only: work on a heap copy of the selected kernel (freed before returning), presumably so memory checkers can catch out-of-bound kernel reads.
    2083             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    2084             :     int *panGaussMatrixDup = static_cast<int *>(
    2085             :         CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
    2086             :     memcpy(panGaussMatrixDup, panGaussMatrix,
    2087             :            sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
    2088             :     panGaussMatrix = panGaussMatrixDup;
    2089             : #endif
    2090             :     // Without an explicit nodata value, pixels that receive no valid contribution default to 0.
    2091          86 :     if (!bHasNoData)
    2092          79 :         dfNoDataValue = 0.0;
    2093             :     // For paletted data, load the color table entries; ReadColorTable() also receives nTransparentIdx (presumably set to a transparent entry when one exists).
    2094          86 :     std::vector<GDALColorEntry> colorEntries;
    2095          86 :     int nTransparentIdx = -1;
    2096          86 :     if (poColorTable)
    2097           2 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    2098             : 
    2099             :     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    2100             :     // it as nodata value.
    2101          92 :     if (bHasNoData && dfNoDataValue >= 0.0 &&
    2102           6 :         dfNoDataValue < colorEntries.size())
    2103           0 :         colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
    2104             : 
    2105             :     // Or if we have no explicit nodata, but a color table entry that is
    2106             :     // transparent, consider it as the nodata value.
    2107          86 :     else if (!bHasNoData && nTransparentIdx >= 0)
    2108             :     {
    2109           0 :         dfNoDataValue = nTransparentIdx;
    2110             :     }
    2111             :     // Exclusive right / bottom limits of the chunk in source coordinates, and the width of one destination row.
    2112          86 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    2113          86 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    2114          86 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
    2115             : 
    2116             :     /* ==================================================================== */
    2117             :     /*      Loop over destination scanlines.                                */
    2118             :     /* ==================================================================== */
    2119       16488 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    2120             :     {
    2121       16402 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    2122       16402 :         int nSrcYOff2 =
    2123       16402 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
    2124             :         // The nominal range above is rounded to the nearest source row; keep its start inside the chunk.
    2125       16402 :         if (nSrcYOff < nChunkYOff)
    2126             :         {
    2127           0 :             nSrcYOff = nChunkYOff;
    2128           0 :             nSrcYOff2++;
    2129             :         }
    2130             :         // Re-center a window of nGaussMatrixDim rows on the middle of the nominal range.
    2131       16402 :         const int iSizeY = nSrcYOff2 - nSrcYOff;
    2132       16402 :         nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
    2133       16402 :         nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
    2134             :         // Never read below the last row of the chunk.
    2135       16402 :         if (nSrcYOff2 > nChunkBottomYOff ||
    2136       16359 :             (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
    2137             :         {
    2138          44 :             nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
    2139             :         }
    2140             :         // If the window starts above the chunk, skip the kernel rows that fall outside and start at the first chunk row.
    2141       16402 :         int nYShiftGaussMatrix = 0;
    2142       16402 :         if (nSrcYOff < nChunkYOff)
    2143             :         {
    2144           0 :             nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
    2145           0 :             nSrcYOff = nChunkYOff;
    2146             :         }
    2147             :         // First source row of the window, and the matching row of the validity mask when a mask is supplied.
    2148       16402 :         const double *const padfSrcScanline =
    2149       16402 :             padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    2150       16402 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    2151       16402 :         if (pabyChunkNodataMask != nullptr)
    2152         152 :             pabySrcScanlineNodataMask =
    2153         152 :                 pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    2154             : 
    2155             :         /* --------------------------------------------------------------------
    2156             :          */
    2157             :         /*      Loop over destination pixels */
    2158             :         /* --------------------------------------------------------------------
    2159             :          */
    2160       16402 :         double *const padfDstScanline =
    2161       16402 :             padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
    2162     4149980 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    2163             :         {
    2164     4133580 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    2165     4133580 :             int nSrcXOff2 =
    2166     4133580 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
    2167             : 
    2168     4133580 :             if (nSrcXOff < nChunkXOff)
    2169             :             {
    2170           0 :                 nSrcXOff = nChunkXOff;
    2171           0 :                 nSrcXOff2++;
    2172             :             }
    2173             :             // Same re-centering, clamping and kernel shift as for the rows, applied to the columns.
    2174     4133580 :             const int iSizeX = nSrcXOff2 - nSrcXOff;
    2175     4133580 :             nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
    2176     4133580 :             nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
    2177             : 
    2178     4133580 :             if (nSrcXOff2 > nChunkRightXOff ||
    2179     4127930 :                 (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
    2180             :             {
    2181        5650 :                 nSrcXOff2 =
    2182        5650 :                     std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
    2183             :             }
    2184             : 
    2185     4133580 :             int nXShiftGaussMatrix = 0;
    2186     4133580 :             if (nSrcXOff < nChunkXOff)
    2187             :             {
    2188           0 :                 nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
    2189           0 :                 nSrcXOff = nChunkXOff;
    2190             :             }
    2191             :             // Without a color table: weighted mean of the valid samples. With one: weighted mean of the R/G/B components, then mapped back to a palette index.
    2192     4133580 :             if (poColorTable == nullptr)
    2193             :             {
    2194     4133380 :                 double dfTotal = 0.0;
    2195     4133380 :                 GInt64 nCount = 0;
    2196     4133380 :                 const int *panLineWeight =
    2197     4133380 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    2198             :                     nXShiftGaussMatrix;
    2199             :                 // panLineWeight advances one kernel row per source row; i is the kernel column. A sample counts when there is no mask or its mask byte is non-zero.
    2200    16527900 :                 for (int iY = nSrcYOff; iY < nSrcYOff2;
    2201    12394500 :                      ++iY, panLineWeight += nGaussMatrixDim)
    2202             :                 {
    2203    49561300 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    2204             :                     {
    2205    37166800 :                         const double val =
    2206    37166800 :                             padfSrcScanline[iX - nChunkXOff +
    2207    37166800 :                                             static_cast<GPtrDiff_t>(iY -
    2208    37166800 :                                                                     nSrcYOff) *
    2209    37166800 :                                                 nChunkXSize];
    2210    37166800 :                         if (pabySrcScanlineNodataMask == nullptr ||
    2211       32872 :                             pabySrcScanlineNodataMask[iX - nChunkXOff +
    2212       32872 :                                                       static_cast<GPtrDiff_t>(
    2213       32872 :                                                           iY - nSrcYOff) *
    2214       32872 :                                                           nChunkXSize])
    2215             :                         {
    2216    37146100 :                             const int nWeight = panLineWeight[i];
    2217    37146100 :                             dfTotal += val * nWeight;
    2218    37146100 :                             nCount += nWeight;
    2219             :                         }
    2220             :                     }
    2221             :                 }
    2222             :                 // nCount is the sum of the weights actually used; zero means no valid sample, so the nodata value is written.
    2223     4133380 :                 if (nCount == 0)
    2224             :                 {
    2225        2217 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2226             :                 }
    2227             :                 else
    2228             :                 {
    2229     4131160 :                     padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
    2230             :                 }
    2231             :             }
    2232             :             else
    2233             :             {
    2234         200 :                 GInt64 nTotalR = 0;
    2235         200 :                 GInt64 nTotalG = 0;
    2236         200 :                 GInt64 nTotalB = 0;
    2237         200 :                 GInt64 nTotalWeight = 0;
    2238         200 :                 const int *panLineWeight =
    2239         200 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    2240             :                     nXShiftGaussMatrix;
    2241             :                 // Sample values are palette indices here; values outside [0, colorEntries.size()) are skipped. The chunk nodata mask is not consulted in this branch.
    2242         780 :                 for (int iY = nSrcYOff; iY < nSrcYOff2;
    2243         580 :                      ++iY, panLineWeight += nGaussMatrixDim)
    2244             :                 {
    2245        2262 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    2246             :                     {
    2247        1682 :                         const double val =
    2248        1682 :                             padfSrcScanline[iX - nChunkXOff +
    2249        1682 :                                             static_cast<GPtrDiff_t>(iY -
    2250        1682 :                                                                     nSrcYOff) *
    2251        1682 :                                                 nChunkXSize];
    2252        1682 :                         if (val < 0 || val >= colorEntries.size())
    2253           0 :                             continue;
    2254             :                         // Only color entries with a non-zero c4 contribute to the sums.
    2255        1682 :                         size_t idx = static_cast<size_t>(val);
    2256        1682 :                         if (colorEntries[idx].c4)
    2257             :                         {
    2258        1682 :                             const int nWeight = panLineWeight[i];
    2259        1682 :                             nTotalR +=
    2260        1682 :                                 static_cast<GInt64>(colorEntries[idx].c1) *
    2261        1682 :                                 nWeight;
    2262        1682 :                             nTotalG +=
    2263        1682 :                                 static_cast<GInt64>(colorEntries[idx].c2) *
    2264        1682 :                                 nWeight;
    2265        1682 :                             nTotalB +=
    2266        1682 :                                 static_cast<GInt64>(colorEntries[idx].c3) *
    2267        1682 :                                 nWeight;
    2268        1682 :                             nTotalWeight += nWeight;
    2269             :                         }
    2270             :                     }
    2271             :                 }
    2272             :                 // No contributing entry: write the nodata value (possibly the transparent index selected above).
    2273         200 :                 if (nTotalWeight == 0)
    2274             :                 {
    2275           0 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2276             :                 }
    2277             :                 else
    2278             :                 {
    2279             :                     GDALColorEntry color;
    2280             :                     // Rounded weighted mean of c1..c3. NOTE(review): color.c4 is never assigned here; confirm BestColorEntry() does not read it.
    2281         200 :                     color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
    2282             :                                                   nTotalWeight);
    2283         200 :                     color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
    2284             :                                                   nTotalWeight);
    2285         200 :                     color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
    2286             :                                                   nTotalWeight);
    2287         200 :                     padfDstScanline[iDstPixel - nDstXOff] =
    2288         200 :                         BestColorEntry(colorEntries, color);
    2289             :                 }
    2290             :             }
    2291             :         }
    2292             :     }
    2293             : 
    2294             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    2295             :     CPLFree(panGaussMatrixDup);
    2296             : #endif
    2297             : 
    2298          86 :     return CE_None;
    2299             : }
    2300             : 
    2301             : /************************************************************************/
    2302             : /*                       GDALResampleChunk_Mode()                       */
    2303             : /************************************************************************/
    2304             : // Equality test used by GDALResampleChunk_ModeT() to match a sample against already recorded values; the generic form is plain operator==.
    2305         688 : template <class T> static inline bool IsSame(T a, T b)
    2306             : {
    2307         688 :     return a == b;
    2308             : }
    2309             : // GFloat16 specialization: equal values match, and two NaN values are also treated as the same value.
    2310          60 : template <> bool IsSame<GFloat16>(GFloat16 a, GFloat16 b)
    2311             : {
    2312          60 :     return a == b || (CPLIsNan(a) && CPLIsNan(b));
    2313             : }
    2314             : // float specialization: equal values match, and two NaN values are also treated as the same value.
    2315        5583 : template <> bool IsSame<float>(float a, float b)
    2316             : {
    2317        5583 :     return a == b || (std::isnan(a) && std::isnan(b));
    2318             : }
    2319             : // double specialization: equal values match, and two NaN values are also treated as the same value.
    2320        1701 : template <> bool IsSame<double>(double a, double b)
    2321             : {
    2322        1701 :     return a == b || (std::isnan(a) && std::isnan(b));
    2323             : }
    2324             : // File-local pair of GFloat16 components (r, i) used as the element type for the ComplexFloat16 instantiations of IsSame() and GDALResampleChunk_ModeT().
    2325             : namespace
    2326             : {
    2327             : struct ComplexFloat16
    2328             : {
    2329             :     GFloat16 r;
    2330             :     GFloat16 i;
    2331             : };
    2332             : }  // namespace
    2333             : // ComplexFloat16 specialization: both parts equal, or all four components NaN. A value with a single NaN part therefore never matches anything.
    2334          60 : template <> bool IsSame<ComplexFloat16>(ComplexFloat16 a, ComplexFloat16 b)
    2335             : {
    2336          90 :     return (a.r == b.r && a.i == b.i) ||
    2337          90 :            (CPLIsNan(a.r) && CPLIsNan(a.i) && CPLIsNan(b.r) && CPLIsNan(b.i));
    2338             : }
    2339             : // std::complex<float> specialization: operator== equality, or real and imaginary parts of both operands all NaN.
    2340             : template <>
    2341          60 : bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
    2342             : {
    2343         120 :     return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
    2344         120 :                       std::isnan(b.real()) && std::isnan(b.imag()));
    2345             : }
    2346             : // std::complex<double> specialization: operator== equality, or real and imaginary parts of both operands all NaN.
    2347             : template <>
    2348          60 : bool IsSame<std::complex<double>>(std::complex<double> a,
    2349             :                                   std::complex<double> b)
    2350             : {
    2351         120 :     return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
    2352         120 :                       std::isnan(b.real()) && std::isnan(b.imag()));
    2353             : }
    2354             : 
    2355             : template <class T>
    2356         182 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
    2357             :                                       const T *pChunk, T *const pDstBuffer)
    2358             : 
    2359             : {
    2360         182 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    2361         182 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    2362         182 :     const double dfSrcXDelta = args.dfSrcXDelta;
    2363         182 :     const double dfSrcYDelta = args.dfSrcYDelta;
    2364         182 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    2365         182 :     const int nChunkXOff = args.nChunkXOff;
    2366         182 :     const int nChunkXSize = args.nChunkXSize;
    2367         182 :     const int nChunkYOff = args.nChunkYOff;
    2368         182 :     const int nChunkYSize = args.nChunkYSize;
    2369         182 :     const int nDstXOff = args.nDstXOff;
    2370         182 :     const int nDstXOff2 = args.nDstXOff2;
    2371         182 :     const int nDstYOff = args.nDstYOff;
    2372         182 :     const int nDstYOff2 = args.nDstYOff2;
    2373         182 :     const bool bHasNoData = args.bHasNoData;
    2374         182 :     const GDALColorTable *poColorTable = args.poColorTable;
    2375         182 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    2376             : 
    2377           8 :     T tNoDataValue;
    2378             :     if constexpr (std::is_same<T, ComplexFloat16>::value)
    2379             :     {
    2380           4 :         tNoDataValue.r = cpl::NumericLimits<GFloat16>::quiet_NaN();
    2381           4 :         tNoDataValue.i = cpl::NumericLimits<GFloat16>::quiet_NaN();
    2382             :     }
    2383             :     else if constexpr (std::is_same<T, std::complex<float>>::value ||
    2384             :                        std::is_same<T, std::complex<double>>::value)
    2385             :     {
    2386             :         using BaseT = typename T::value_type;
    2387           8 :         tNoDataValue =
    2388             :             std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
    2389             :                                 std::numeric_limits<BaseT>::quiet_NaN());
    2390             :     }
    2391         170 :     else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
    2392         169 :         tNoDataValue = 0;
    2393             :     else
    2394           1 :         tNoDataValue = static_cast<T>(args.dfNoDataValue);
    2395             : 
    2396             :     using CountType = uint32_t;
    2397         182 :     CountType nMaxNumPx = 0;
    2398         182 :     T *paVals = nullptr;
    2399         182 :     CountType *panCounts = nullptr;
    2400             : 
    2401         182 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    2402         182 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    2403         364 :     std::vector<int> anVals(256, 0);
    2404             : 
    2405             :     /* ==================================================================== */
    2406             :     /*      Loop over destination scanlines.                                */
    2407             :     /* ==================================================================== */
    2408        7713 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    2409             :     {
    2410        7531 :         const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    2411        7531 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    2412             : #ifdef only_pixels_with_more_than_10_pct_participation
    2413             :         // When oversampling, don't take into account pixels that have a tiny
    2414             :         // participation in the resulting pixel
    2415             :         if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
    2416             :             nSrcYOff < nChunkBottomYOff)
    2417             :             nSrcYOff++;
    2418             : #endif
    2419        7531 :         if (nSrcYOff < nChunkYOff)
    2420           0 :             nSrcYOff = nChunkYOff;
    2421             : 
    2422        7531 :         const double dfSrcYOff2 =
    2423        7531 :             dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    2424        7531 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    2425             : #ifdef only_pixels_with_more_than_10_pct_participation
    2426             :         // When oversampling, don't take into account pixels that have a tiny
    2427             :         // participation in the resulting pixel
    2428             :         if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
    2429             :             nSrcYOff2 > nChunkYOff)
    2430             :             nSrcYOff2--;
    2431             : #endif
    2432        7531 :         if (nSrcYOff2 == nSrcYOff)
    2433           0 :             ++nSrcYOff2;
    2434        7531 :         if (nSrcYOff2 > nChunkBottomYOff)
    2435           0 :             nSrcYOff2 = nChunkBottomYOff;
    2436             : 
    2437        7531 :         const T *const paSrcScanline =
    2438         281 :             pChunk +
    2439        7531 :             (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
    2440        7531 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    2441        7531 :         if (pabyChunkNodataMask != nullptr)
    2442        1838 :             pabySrcScanlineNodataMask =
    2443             :                 pabyChunkNodataMask +
    2444        1838 :                 static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
    2445             : 
    2446        7531 :         T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
    2447             :         /* --------------------------------------------------------------------
    2448             :          */
    2449             :         /*      Loop over destination pixels */
    2450             :         /* --------------------------------------------------------------------
    2451             :          */
    2452     4260596 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    2453             :         {
    2454     4253061 :             const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    2455             :             // Apply some epsilon to avoid numerical precision issues
    2456     4253061 :             int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    2457             : #ifdef only_pixels_with_more_than_10_pct_participation
    2458             :             // When oversampling, don't take into account pixels that have a
    2459             :             // tiny participation in the resulting pixel
    2460             :             if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
    2461             :                 nSrcXOff < nChunkRightXOff)
    2462             :                 nSrcXOff++;
    2463             : #endif
    2464     4253061 :             if (nSrcXOff < nChunkXOff)
    2465           0 :                 nSrcXOff = nChunkXOff;
    2466             : 
    2467     4253061 :             const double dfSrcXOff2 =
    2468     4253061 :                 dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    2469     4253061 :             int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    2470             : #ifdef only_pixels_with_more_than_10_pct_participation
    2471             :             // When oversampling, don't take into account pixels that have a
    2472             :             // tiny participation in the resulting pixel
    2473             :             if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
    2474             :                 nSrcXOff2 > nChunkXOff)
    2475             :                 nSrcXOff2--;
    2476             : #endif
    2477     4253061 :             if (nSrcXOff2 == nSrcXOff)
    2478           0 :                 nSrcXOff2++;
    2479     4253061 :             if (nSrcXOff2 > nChunkRightXOff)
    2480           0 :                 nSrcXOff2 = nChunkRightXOff;
    2481             : 
    2482     4253061 :             bool bRegularProcessing = false;
    2483             :             if constexpr (!std::is_same<T, GByte>::value)
    2484        1671 :                 bRegularProcessing = true;
    2485     4251390 :             else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
    2486           0 :                 bRegularProcessing = true;
    2487             : 
    2488     4253061 :             if (bRegularProcessing)
    2489             :             {
    2490             :                 // Sanity check to make sure the allocation of paVals and
    2491             :                 // panCounts don't overflow.
    2492             :                 static_assert(sizeof(CountType) <= sizeof(size_t));
    2493        3342 :                 if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
    2494        1671 :                     static_cast<CountType>(nSrcYOff2 - nSrcYOff) >
    2495        1671 :                         (std::numeric_limits<CountType>::max() /
    2496        3342 :                          std::max(sizeof(T), sizeof(CountType))) /
    2497        1671 :                             static_cast<CountType>(nSrcXOff2 - nSrcXOff))
    2498             :                 {
    2499           0 :                     CPLError(CE_Failure, CPLE_NotSupported,
    2500             :                              "Too big downsampling factor");
    2501           0 :                     CPLFree(paVals);
    2502           0 :                     CPLFree(panCounts);
    2503           0 :                     return CE_Failure;
    2504             :                 }
    2505        1671 :                 const CountType nNumPx =
    2506        1671 :                     static_cast<CountType>(nSrcYOff2 - nSrcYOff) *
    2507        1671 :                     (nSrcXOff2 - nSrcXOff);
    2508        1671 :                 CountType iMaxInd = 0;
    2509        1671 :                 CountType iMaxVal = 0;
    2510             : 
    2511        1671 :                 if (paVals == nullptr || nNumPx > nMaxNumPx)
    2512             :                 {
    2513             :                     T *paValsNew = static_cast<T *>(
    2514         116 :                         VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
    2515             :                     CountType *panCountsNew =
    2516         116 :                         static_cast<CountType *>(VSI_REALLOC_VERBOSE(
    2517             :                             panCounts, nNumPx * sizeof(CountType)));
    2518         116 :                     if (paValsNew != nullptr)
    2519         116 :                         paVals = paValsNew;
    2520         116 :                     if (panCountsNew != nullptr)
    2521         116 :                         panCounts = panCountsNew;
    2522         116 :                     if (paValsNew == nullptr || panCountsNew == nullptr)
    2523             :                     {
    2524           0 :                         CPLFree(paVals);
    2525           0 :                         CPLFree(panCounts);
    2526           0 :                         return CE_Failure;
    2527             :                     }
    2528         116 :                     nMaxNumPx = nNumPx;
    2529             :                 }
    2530             : 
    2531        5245 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2532             :                 {
    2533        3574 :                     const GPtrDiff_t iTotYOff =
    2534        3574 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2535        3574 :                         nChunkXOff;
    2536       11842 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2537             :                     {
    2538        8268 :                         if (pabySrcScanlineNodataMask == nullptr ||
    2539        1552 :                             pabySrcScanlineNodataMask[iX + iTotYOff])
    2540             :                         {
    2541        8247 :                             const T val = paSrcScanline[iX + iTotYOff];
    2542        8247 :                             CountType i = 0;  // Used after for.
    2543             : 
    2544             :                             // Check array for existing entry.
    2545       11611 :                             for (; i < iMaxInd; ++i)
    2546             :                             {
    2547        8212 :                                 if (IsSame(paVals[i], val))
    2548             :                                 {
    2549        4848 :                                     if (++panCounts[i] > panCounts[iMaxVal])
    2550             :                                     {
    2551         246 :                                         iMaxVal = i;
    2552             :                                     }
    2553        4848 :                                     break;
    2554             :                                 }
    2555             :                             }
    2556             : 
    2557             :                             // Add to arr if entry not already there.
    2558        8247 :                             if (i == iMaxInd)
    2559             :                             {
    2560        3399 :                                 paVals[iMaxInd] = val;
    2561        3399 :                                 panCounts[iMaxInd] = 1;
    2562             : 
    2563        3399 :                                 if (iMaxInd == 0)
    2564             :                                 {
    2565        1668 :                                     iMaxVal = iMaxInd;
    2566             :                                 }
    2567             : 
    2568        3399 :                                 ++iMaxInd;
    2569             :                             }
    2570             :                         }
    2571             :                     }
    2572             :                 }
    2573             : 
    2574        1671 :                 if (iMaxInd == 0)
    2575           3 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2576             :                 else
    2577        1668 :                     paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
    2578             :             }
    2579             :             else if constexpr (std::is_same<T, GByte>::value)
    2580             :             // ( eSrcDataType == GDT_UInt8 && nEntryCount < 256 )
    2581             :             {
    2582             :                 // So we go here for a paletted or non-paletted byte band.
    2583             :                 // The input values are then between 0 and 255.
    2584     4251390 :                 int nMaxVal = 0;
    2585     4251390 :                 int iMaxInd = -1;
    2586             : 
    2587             :                 // The cost of this zeroing might be high. Perhaps we should
    2588             :                 // just use the above generic case, and go to this one if the
    2589             :                 // number of source pixels is large enough
    2590     4251390 :                 std::fill(anVals.begin(), anVals.end(), 0);
    2591             : 
    2592    12777800 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2593             :                 {
    2594     8526440 :                     const GPtrDiff_t iTotYOff =
    2595     8526440 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2596     8526440 :                         nChunkXOff;
    2597    25649600 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2598             :                     {
    2599    17123100 :                         const T val = paSrcScanline[iX + iTotYOff];
    2600    17123100 :                         if (!bHasNoData || val != tNoDataValue)
    2601             :                         {
    2602    17123100 :                             int nVal = static_cast<int>(val);
    2603    17123100 :                             if (++anVals[nVal] > nMaxVal)
    2604             :                             {
    2605             :                                 // Sum the density.
    2606             :                                 // Is it the most common value so far?
    2607    17006400 :                                 iMaxInd = nVal;
    2608    17006400 :                                 nMaxVal = anVals[nVal];
    2609             :                             }
    2610             :                         }
    2611             :                     }
    2612             :                 }
    2613             : 
    2614     4251390 :                 if (iMaxInd == -1)
    2615           0 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2616             :                 else
    2617     4251390 :                     paDstScanline[iDstPixel - nDstXOff] =
    2618             :                         static_cast<T>(iMaxInd);
    2619             :             }
    2620             :         }
    2621             :     }
    2622             : 
    2623         182 :     CPLFree(paVals);
    2624         182 :     CPLFree(panCounts);
    2625             : 
    2626         182 :     return CE_None;
    2627             : }
    2628             : 
    2629         182 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
    2630             :                                      const void *pChunk, void **ppDstBuffer,
    2631             :                                      GDALDataType *peDstBufferDataType)
    2632             : {
    2633         182 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(
    2634             :         args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
    2635             :         GDALGetDataTypeSizeBytes(args.eWrkDataType));
    2636         182 :     if (*ppDstBuffer == nullptr)
    2637             :     {
    2638           0 :         return CE_Failure;
    2639             :     }
    2640             : 
    2641         182 :     CPLAssert(args.eSrcDataType == args.eWrkDataType);
    2642             : 
    2643         182 :     *peDstBufferDataType = args.eWrkDataType;
    2644         182 :     switch (args.eWrkDataType)
    2645             :     {
    2646             :         // For mode resampling, as no computation is done, only the
    2647             :         // size of the data type matters... except for Byte where we have
    2648             :         // special processing. And for floating point values
    2649          66 :         case GDT_UInt8:
    2650             :         {
    2651          66 :             return GDALResampleChunk_ModeT(args,
    2652             :                                            static_cast<const GByte *>(pChunk),
    2653          66 :                                            static_cast<GByte *>(*ppDstBuffer));
    2654             :         }
    2655             : 
    2656           4 :         case GDT_Int8:
    2657             :         {
    2658           4 :             return GDALResampleChunk_ModeT(args,
    2659             :                                            static_cast<const int8_t *>(pChunk),
    2660           4 :                                            static_cast<int8_t *>(*ppDstBuffer));
    2661             :         }
    2662             : 
    2663          10 :         case GDT_Int16:
    2664             :         case GDT_UInt16:
    2665             :         {
    2666          10 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
    2667          10 :             return GDALResampleChunk_ModeT(
    2668             :                 args, static_cast<const uint16_t *>(pChunk),
    2669          10 :                 static_cast<uint16_t *>(*ppDstBuffer));
    2670             :         }
    2671             : 
    2672          15 :         case GDT_CInt16:
    2673             :         case GDT_Int32:
    2674             :         case GDT_UInt32:
    2675             :         {
    2676          15 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
    2677          15 :             return GDALResampleChunk_ModeT(
    2678             :                 args, static_cast<const uint32_t *>(pChunk),
    2679          15 :                 static_cast<uint32_t *>(*ppDstBuffer));
    2680             :         }
    2681             : 
    2682          12 :         case GDT_CInt32:
    2683             :         case GDT_Int64:
    2684             :         case GDT_UInt64:
    2685             :         {
    2686          12 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
    2687          12 :             return GDALResampleChunk_ModeT(
    2688             :                 args, static_cast<const uint64_t *>(pChunk),
    2689          12 :                 static_cast<uint64_t *>(*ppDstBuffer));
    2690             :         }
    2691             : 
    2692           4 :         case GDT_Float16:
    2693             :         {
    2694           4 :             return GDALResampleChunk_ModeT(
    2695             :                 args, static_cast<const GFloat16 *>(pChunk),
    2696           4 :                 static_cast<GFloat16 *>(*ppDstBuffer));
    2697             :         }
    2698             : 
    2699          35 :         case GDT_Float32:
    2700             :         {
    2701          35 :             return GDALResampleChunk_ModeT(args,
    2702             :                                            static_cast<const float *>(pChunk),
    2703          35 :                                            static_cast<float *>(*ppDstBuffer));
    2704             :         }
    2705             : 
    2706          24 :         case GDT_Float64:
    2707             :         {
    2708          24 :             return GDALResampleChunk_ModeT(args,
    2709             :                                            static_cast<const double *>(pChunk),
    2710          24 :                                            static_cast<double *>(*ppDstBuffer));
    2711             :         }
    2712             : 
    2713           4 :         case GDT_CFloat16:
    2714             :         {
    2715           4 :             return GDALResampleChunk_ModeT(
    2716             :                 args, static_cast<const ComplexFloat16 *>(pChunk),
    2717           4 :                 static_cast<ComplexFloat16 *>(*ppDstBuffer));
    2718             :         }
    2719             : 
    2720           4 :         case GDT_CFloat32:
    2721             :         {
    2722           4 :             return GDALResampleChunk_ModeT(
    2723             :                 args, static_cast<const std::complex<float> *>(pChunk),
    2724           4 :                 static_cast<std::complex<float> *>(*ppDstBuffer));
    2725             :         }
    2726             : 
    2727           4 :         case GDT_CFloat64:
    2728             :         {
    2729           4 :             return GDALResampleChunk_ModeT(
    2730             :                 args, static_cast<const std::complex<double> *>(pChunk),
    2731           4 :                 static_cast<std::complex<double> *>(*ppDstBuffer));
    2732             :         }
    2733             : 
    2734           0 :         case GDT_Unknown:
    2735             :         case GDT_TypeCount:
    2736           0 :             break;
    2737             :     }
    2738             : 
    2739           0 :     CPLAssert(false);
    2740             :     return CE_Failure;
    2741             : }
    2742             : 
    2743             : /************************************************************************/
    2744             : /*                 GDALResampleConvolutionHorizontal()                  */
    2745             : /************************************************************************/
    2746             : 
    2747             : template <class T>
    2748             : static inline double
    2749       46038 : GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
    2750             :                                   int nSrcPixelCount)
    2751             : {
    2752       46038 :     double dfVal1 = 0.0;
    2753       46038 :     double dfVal2 = 0.0;
    2754       46038 :     int i = 0;  // Used after for.
    2755             :     // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    2756             :     // manually (untypical) unrolled loop in -O2 and -O3:
    2757             :     // https://github.com/OSGeo/gdal/issues/9508
    2758             : #if !defined(__INTEL_CLANG_COMPILER)
    2759       92396 :     for (; i < nSrcPixelCount - 3; i += 4)
    2760             :     {
    2761       46358 :         dfVal1 += double(pChunk[i + 0]) * padfWeights[i];
    2762       46358 :         dfVal1 += double(pChunk[i + 1]) * padfWeights[i + 1];
    2763       46358 :         dfVal2 += double(pChunk[i + 2]) * padfWeights[i + 2];
    2764       46358 :         dfVal2 += double(pChunk[i + 3]) * padfWeights[i + 3];
    2765             :     }
    2766             : #endif
    2767       48662 :     for (; i < nSrcPixelCount; ++i)
    2768             :     {
    2769        2624 :         dfVal1 += double(pChunk[i]) * padfWeights[i];
    2770             :     }
    2771       46038 :     return dfVal1 + dfVal2;
    2772             : }
    2773             : 
    2774             : template <class T, bool bHasNaN>
    2775       46368 : static inline void GDALResampleConvolutionHorizontalWithMask(
    2776             :     const T *pChunk, const GByte *pabyMask, const double *padfWeights,
    2777             :     int nSrcPixelCount, double &dfVal, double &dfWeightSum)
    2778             : {
    2779       46368 :     dfVal = 0;
    2780       46368 :     dfWeightSum = 0;
    2781       46368 :     int i = 0;
    2782      103804 :     for (; i < nSrcPixelCount - 3; i += 4)
    2783             :     {
    2784       57436 :         double dfWeight0 = padfWeights[i + 0] * pabyMask[i + 0];
    2785       57436 :         double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
    2786       57436 :         double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
    2787       57436 :         double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
    2788             : 
    2789      229744 :         const auto MulNaNAware = [](double v, double &w, double &val)
    2790             :         {
    2791             :             if constexpr (bHasNaN)
    2792             :             {
    2793       14848 :                 if (std::isnan(v))
    2794             :                 {
    2795          76 :                     w = 0;
    2796          76 :                     return;
    2797             :                 }
    2798             :             }
    2799       14772 :             val += v * w;
    2800             :         };
    2801             : 
    2802       57436 :         MulNaNAware(double(pChunk[i + 0]), dfWeight0, dfVal);
    2803       57436 :         MulNaNAware(double(pChunk[i + 1]), dfWeight1, dfVal);
    2804       57436 :         MulNaNAware(double(pChunk[i + 2]), dfWeight2, dfVal);
    2805       57436 :         MulNaNAware(double(pChunk[i + 3]), dfWeight3, dfVal);
    2806       57436 :         dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
    2807             :     }
    2808       64874 :     for (; i < nSrcPixelCount; ++i)
    2809             :     {
    2810       18506 :         const double dfWeight = padfWeights[i] * pabyMask[i];
    2811             :         if constexpr (bHasNaN)
    2812             :         {
    2813        1920 :             if (!std::isnan(pChunk[i]))
    2814             :             {
    2815        1920 :                 dfVal += double(pChunk[i]) * dfWeight;
    2816        1920 :                 dfWeightSum += dfWeight;
    2817             :             }
    2818             :         }
    2819             :         else
    2820             :         {
    2821       16586 :             dfVal += double(pChunk[i]) * dfWeight;
    2822       16586 :             dfWeightSum += dfWeight;
    2823             :         }
    2824             :     }
    2825       46368 : }
    2826             : 
    2827             : template <class T, bool bHasNaN>
    2828     1341366 : static inline void GDALResampleConvolutionHorizontal_3rows(
    2829             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2830             :     const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    2831             :     double &dfRes2, double &dfRes3)
    2832             : {
    2833     1341366 :     double dfVal1 = 0.0;
    2834     1341366 :     double dfVal2 = 0.0;
    2835     1341366 :     double dfVal3 = 0.0;
    2836     1341366 :     double dfVal4 = 0.0;
    2837     1341366 :     double dfVal5 = 0.0;
    2838     1341366 :     double dfVal6 = 0.0;
    2839     1341366 :     int i = 0;  // Used after for.
    2840             : 
    2841    16866840 :     const auto MulNaNAware = [](double a, double w)
    2842             :     {
    2843             :         if constexpr (bHasNaN)
    2844             :         {
    2845           0 :             if (std::isnan(a))
    2846           0 :                 return 0.0;
    2847             :         }
    2848    16866900 :         return a * w;
    2849             :     };
    2850             : 
    2851     2736937 :     for (; i < nSrcPixelCount - 3; i += 4)
    2852             :     {
    2853     1395570 :         dfVal1 += MulNaNAware(double(pChunkRow1[i + 0]), padfWeights[i + 0]);
    2854     1395570 :         dfVal1 += MulNaNAware(double(pChunkRow1[i + 1]), padfWeights[i + 1]);
    2855     1395570 :         dfVal2 += MulNaNAware(double(pChunkRow1[i + 2]), padfWeights[i + 2]);
    2856     1395570 :         dfVal2 += MulNaNAware(double(pChunkRow1[i + 3]), padfWeights[i + 3]);
    2857     1395570 :         dfVal3 += MulNaNAware(double(pChunkRow2[i + 0]), padfWeights[i + 0]);
    2858     1395570 :         dfVal3 += MulNaNAware(double(pChunkRow2[i + 1]), padfWeights[i + 1]);
    2859     1395570 :         dfVal4 += MulNaNAware(double(pChunkRow2[i + 2]), padfWeights[i + 2]);
    2860     1395570 :         dfVal4 += MulNaNAware(double(pChunkRow2[i + 3]), padfWeights[i + 3]);
    2861     1395570 :         dfVal5 += MulNaNAware(double(pChunkRow3[i + 0]), padfWeights[i + 0]);
    2862     1395570 :         dfVal5 += MulNaNAware(double(pChunkRow3[i + 1]), padfWeights[i + 1]);
    2863     1395570 :         dfVal6 += MulNaNAware(double(pChunkRow3[i + 2]), padfWeights[i + 2]);
    2864     1395570 :         dfVal6 += MulNaNAware(double(pChunkRow3[i + 3]), padfWeights[i + 3]);
    2865             :     }
    2866     1381377 :     for (; i < nSrcPixelCount; ++i)
    2867             :     {
    2868       40011 :         dfVal1 += MulNaNAware(double(pChunkRow1[i]), padfWeights[i]);
    2869       40011 :         dfVal3 += MulNaNAware(double(pChunkRow2[i]), padfWeights[i]);
    2870       40011 :         dfVal5 += MulNaNAware(double(pChunkRow3[i]), padfWeights[i]);
    2871             :     }
    2872     1341366 :     dfRes1 = dfVal1 + dfVal2;
    2873     1341366 :     dfRes2 = dfVal3 + dfVal4;
    2874     1341366 :     dfRes3 = dfVal5 + dfVal6;
    2875     1341366 : }
    2876             : 
    2877             : template <class T, bool bHasNaN>
    2878       18980 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    2879             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2880             :     const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    2881             :     double &dfRes2, double &dfRes3)
    2882             : {
    2883       18980 :     GDALResampleConvolutionHorizontal_3rows<T, bHasNaN>(
    2884             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, nSrcPixelCount, dfRes1,
    2885             :         dfRes2, dfRes3);
    2886       18980 : }
    2887             : 
    2888             : template <class T, bool bHasNaN>
    2889     1256690 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
    2890             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2891             :     const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
    2892             : {
    2893     1256690 :     GDALResampleConvolutionHorizontal_3rows<T, bHasNaN>(
    2894             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, 4, dfRes1, dfRes2,
    2895             :         dfRes3);
    2896     1256690 : }
    2897             : 
    2898             : /************************************************************************/
    2899             : /*                  GDALResampleConvolutionVertical()                   */
    2900             : /************************************************************************/
    2901             : 
    2902             : template <class T>
    2903             : static inline double
    2904      471387 : GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
    2905             :                                 const double *padfWeights, int nSrcLineCount)
    2906             : {
    2907      471387 :     double dfVal1 = 0.0;
    2908      471387 :     double dfVal2 = 0.0;
    2909      471387 :     int i = 0;
    2910      471387 :     size_t j = 0;
    2911      933894 :     for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
    2912             :     {
    2913      462507 :         dfVal1 += pChunk[j + 0 * nStride] * padfWeights[i + 0];
    2914      462507 :         dfVal1 += pChunk[j + 1 * nStride] * padfWeights[i + 1];
    2915      462507 :         dfVal2 += pChunk[j + 2 * nStride] * padfWeights[i + 2];
    2916      462507 :         dfVal2 += pChunk[j + 3 * nStride] * padfWeights[i + 3];
    2917             :     }
    2918      525654 :     for (; i < nSrcLineCount; ++i, j += nStride)
    2919             :     {
    2920       54267 :         dfVal1 += pChunk[j] * padfWeights[i];
    2921             :     }
    2922      471387 :     return dfVal1 + dfVal2;
    2923             : }
    2924             : 
    2925             : template <class T>
    2926     2930610 : static inline void GDALResampleConvolutionVertical_2cols(
    2927             :     const T *pChunk, size_t nStride, const double *padfWeights,
    2928             :     int nSrcLineCount, double &dfRes1, double &dfRes2)
    2929             : {
    2930     2930610 :     double dfVal1 = 0.0;
    2931     2930610 :     double dfVal2 = 0.0;
    2932     2930610 :     double dfVal3 = 0.0;
    2933     2930610 :     double dfVal4 = 0.0;
    2934     2930610 :     int i = 0;
    2935     2930610 :     size_t j = 0;
    2936     5863170 :     for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
    2937             :     {
    2938     2932560 :         dfVal1 += pChunk[j + 0 + 0 * nStride] * padfWeights[i + 0];
    2939     2932560 :         dfVal3 += pChunk[j + 1 + 0 * nStride] * padfWeights[i + 0];
    2940     2932560 :         dfVal1 += pChunk[j + 0 + 1 * nStride] * padfWeights[i + 1];
    2941     2932560 :         dfVal3 += pChunk[j + 1 + 1 * nStride] * padfWeights[i + 1];
    2942     2932560 :         dfVal2 += pChunk[j + 0 + 2 * nStride] * padfWeights[i + 2];
    2943     2932560 :         dfVal4 += pChunk[j + 1 + 2 * nStride] * padfWeights[i + 2];
    2944     2932560 :         dfVal2 += pChunk[j + 0 + 3 * nStride] * padfWeights[i + 3];
    2945     2932560 :         dfVal4 += pChunk[j + 1 + 3 * nStride] * padfWeights[i + 3];
    2946             :     }
    2947     3053490 :     for (; i < nSrcLineCount; ++i, j += nStride)
    2948             :     {
    2949      122880 :         dfVal1 += pChunk[j + 0] * padfWeights[i];
    2950      122880 :         dfVal3 += pChunk[j + 1] * padfWeights[i];
    2951             :     }
    2952     2930610 :     dfRes1 = dfVal1 + dfVal2;
    2953     2930610 :     dfRes2 = dfVal3 + dfVal4;
    2954     2930610 : }
    2955             : 
    2956             : #ifdef USE_SSE2
    2957             : 
    2958             : #ifdef __AVX__
    2959             : /************************************************************************/
    2960             : /*              GDALResampleConvolutionVertical_16cols<T>               */
    2961             : /************************************************************************/
    2962             : 
    2963             : template <class T>
    2964             : static inline void
    2965             : GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
    2966             :                                        const double *padfWeights,
    2967             :                                        int nSrcLineCount, float *afDest)
    2968             : {
    2969             :     int i = 0;
    2970             :     size_t j = 0;
    2971             :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
    2972             :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2973             :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    2974             :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
    2975             :     for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
    2976             :     {
    2977             :         XMMReg4Double w0 =
    2978             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
    2979             :         XMMReg4Double w1 =
    2980             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    2981             :         XMMReg4Double w2 =
    2982             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    2983             :         XMMReg4Double w3 =
    2984             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    2985             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
    2986             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    2987             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
    2988             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
    2989             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
    2990             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    2991             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
    2992             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
    2993             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
    2994             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    2995             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
    2996             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
    2997             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
    2998             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    2999             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
    3000             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
    3001             :     }
    3002             :     for (; i < nSrcLineCount; ++i, j += nStride)
    3003             :     {
    3004             :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    3005             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    3006             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    3007             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
    3008             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
    3009             :     }
    3010             :     v_acc0.Store4Val(afDest);
    3011             :     v_acc1.Store4Val(afDest + 4);
    3012             :     v_acc2.Store4Val(afDest + 8);
    3013             :     v_acc3.Store4Val(afDest + 12);
    3014             : }
    3015             : 
    3016             : template <class T>
    3017             : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
    3018             :                                                           const double *, int,
    3019             :                                                           double *)
    3020             : {
    3021             :     // Cannot be reached
    3022             :     CPLAssert(false);
    3023             : }
    3024             : 
    3025             : #else
    3026             : 
    3027             : /************************************************************************/
    3028             : /*               GDALResampleConvolutionVertical_8cols<T>               */
    3029             : /************************************************************************/
    3030             : 
    3031             : template <class T>
    3032             : static inline void
    3033    25689200 : GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,
    3034             :                                       const double *padfWeights,
    3035             :                                       int nSrcLineCount, float *afDest)
    3036             : {
    3037    25689200 :     int i = 0;
    3038    25689200 :     size_t j = 0;
    3039    25689200 :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
    3040    25689200 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    3041    53654900 :     for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
    3042             :     {
    3043    27965700 :         XMMReg4Double w0 =
    3044    27965700 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
    3045    27965700 :         XMMReg4Double w1 =
    3046    27965700 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    3047    27965700 :         XMMReg4Double w2 =
    3048    27965700 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    3049    27965700 :         XMMReg4Double w3 =
    3050    27965700 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    3051    27965700 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
    3052    27965700 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    3053    27965700 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
    3054    27965700 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    3055    27965700 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
    3056    27965700 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    3057    27965700 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
    3058    27965700 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    3059             :     }
    3060    37257700 :     for (; i < nSrcLineCount; ++i, j += nStride)
    3061             :     {
    3062    11568400 :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    3063    11568400 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    3064    11568400 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    3065             :     }
    3066    25689200 :     v_acc0.Store4Val(afDest);
    3067    25689200 :     v_acc1.Store4Val(afDest + 4);
    3068    25689200 : }
    3069             : 
    3070             : template <class T>
    3071             : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
    3072             :                                                          const double *, int,
    3073             :                                                          double *)
    3074             : {
    3075             :     // Cannot be reached
    3076             :     CPLAssert(false);
    3077             : }
    3078             : 
    3079             : #endif  // __AVX__
    3080             : 
    3081             : /************************************************************************/
    3082             : /*               GDALResampleConvolutionHorizontalSSE2<T>               */
    3083             : /************************************************************************/
    3084             : 
    3085             : template <class T>
    3086     3141452 : static inline double GDALResampleConvolutionHorizontalSSE2(
    3087             :     const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    3088             : {
    3089     3141452 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    3090     3141452 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    3091     3141452 :     int i = 0;  // Used after for.
    3092     3520408 :     for (; i < nSrcPixelCount - 7; i += 8)
    3093             :     {
    3094             :         // Retrieve the pixel & accumulate
    3095      378952 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
    3096      378952 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
    3097      378952 :         const XMMReg4Double v_weight1 =
    3098      378952 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3099      378952 :         const XMMReg4Double v_weight2 =
    3100      378952 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    3101             : 
    3102      378952 :         v_acc1 += v_pixels1 * v_weight1;
    3103      378952 :         v_acc2 += v_pixels2 * v_weight2;
    3104             :     }
    3105             : 
    3106     3141452 :     v_acc1 += v_acc2;
    3107             : 
    3108     3141452 :     double dfVal = v_acc1.GetHorizSum();
    3109    10321830 :     for (; i < nSrcPixelCount; ++i)
    3110             :     {
    3111     7180380 :         dfVal += pChunk[i] * padfWeightsAligned[i];
    3112             :     }
    3113     3141452 :     return dfVal;
    3114             : }
    3115             : 
    3116             : /************************************************************************/
    3117             : /*               GDALResampleConvolutionHorizontal<GByte>               */
    3118             : /************************************************************************/
    3119             : 
    3120             : template <>
    3121     2592290 : inline double GDALResampleConvolutionHorizontal<GByte>(
    3122             :     const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    3123             : {
    3124     2592290 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
    3125     2592290 :                                                  nSrcPixelCount);
    3126             : }
    3127             : 
    3128             : template <>
    3129      549162 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
    3130             :     const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    3131             : {
    3132      549162 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
    3133      549162 :                                                  nSrcPixelCount);
    3134             : }
    3135             : 
    3136             : /************************************************************************/
    3137             : /*           GDALResampleConvolutionHorizontalWithMaskSSE2<T>           */
    3138             : /************************************************************************/
    3139             : 
    3140             : template <class T>
    3141     6408653 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
    3142             :     const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
    3143             :     int nSrcPixelCount, double &dfVal, double &dfWeightSum)
    3144             : {
    3145     6408653 :     int i = 0;  // Used after for.
    3146     6408653 :     XMMReg4Double v_acc = XMMReg4Double::Zero();
    3147     6408653 :     XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
    3148    17785121 :     for (; i < nSrcPixelCount - 3; i += 4)
    3149             :     {
    3150    11376458 :         const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
    3151    11376458 :         const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
    3152    11376458 :         XMMReg4Double v_weight =
    3153    11376458 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3154    11376458 :         v_weight *= v_mask;
    3155    11376458 :         v_acc += v_pixels * v_weight;
    3156    11376458 :         v_acc_weight += v_weight;
    3157             :     }
    3158             : 
    3159     6408653 :     dfVal = v_acc.GetHorizSum();
    3160     6408653 :     dfWeightSum = v_acc_weight.GetHorizSum();
    3161     6614913 :     for (; i < nSrcPixelCount; ++i)
    3162             :     {
    3163      206258 :         const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
    3164      206258 :         dfVal += pChunk[i] * dfWeight;
    3165      206258 :         dfWeightSum += dfWeight;
    3166             :     }
    3167     6408653 : }
    3168             : 
    3169             : /************************************************************************/
    3170             : /*           GDALResampleConvolutionHorizontalWithMask<GByte>           */
    3171             : /************************************************************************/
    3172             : 
    3173             : template <>
    3174     6408590 : inline void GDALResampleConvolutionHorizontalWithMask<GByte, false>(
    3175             :     const GByte *pChunk, const GByte *pabyMask,
    3176             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
    3177             :     double &dfWeightSum)
    3178             : {
    3179     6408590 :     GDALResampleConvolutionHorizontalWithMaskSSE2(
    3180             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
    3181             :         dfWeightSum);
    3182     6408590 : }
    3183             : 
    3184             : template <>
    3185          63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16, false>(
    3186             :     const GUInt16 *pChunk, const GByte *pabyMask,
    3187             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
    3188             :     double &dfWeightSum)
    3189             : {
    3190          63 :     GDALResampleConvolutionHorizontalWithMaskSSE2(
    3191             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
    3192             :         dfWeightSum);
    3193          63 : }
    3194             : 
    3195             : /************************************************************************/
    3196             : /*           GDALResampleConvolutionHorizontal_3rows_SSE2<T>            */
    3197             : /************************************************************************/
    3198             : 
    3199             : template <class T>
    3200    35560186 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
    3201             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    3202             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3203             :     double &dfRes2, double &dfRes3)
    3204             : {
    3205    35560186 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
    3206    35560186 :                   v_acc2 = XMMReg4Double::Zero(),
    3207    35560186 :                   v_acc3 = XMMReg4Double::Zero();
    3208    35560186 :     int i = 0;
    3209    70929556 :     for (; i < nSrcPixelCount - 7; i += 8)
    3210             :     {
    3211             :         // Retrieve the pixel & accumulate.
    3212    35369370 :         XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    3213    35369370 :         XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
    3214    35369370 :         const XMMReg4Double v_weight1 =
    3215    35369370 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3216    35369370 :         const XMMReg4Double v_weight2 =
    3217    35369370 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    3218             : 
    3219    35369370 :         v_acc1 += v_pixels1 * v_weight1;
    3220    35369370 :         v_acc1 += v_pixels2 * v_weight2;
    3221             : 
    3222    35369370 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
    3223    35369370 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
    3224    35369370 :         v_acc2 += v_pixels1 * v_weight1;
    3225    35369370 :         v_acc2 += v_pixels2 * v_weight2;
    3226             : 
    3227    35369370 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
    3228    35369370 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
    3229    35369370 :         v_acc3 += v_pixels1 * v_weight1;
    3230    35369370 :         v_acc3 += v_pixels2 * v_weight2;
    3231             :     }
    3232             : 
    3233    35560186 :     dfRes1 = v_acc1.GetHorizSum();
    3234    35560186 :     dfRes2 = v_acc2.GetHorizSum();
    3235    35560186 :     dfRes3 = v_acc3.GetHorizSum();
    3236    47825952 :     for (; i < nSrcPixelCount; ++i)
    3237             :     {
    3238    12265766 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    3239    12265766 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    3240    12265766 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    3241             :     }
    3242    35560186 : }
    3243             : 
    3244             : /************************************************************************/
    3245             : /*            GDALResampleConvolutionHorizontal_3rows<GByte>            */
    3246             : /************************************************************************/
    3247             : 
    3248             : template <>
    3249    35560100 : inline void GDALResampleConvolutionHorizontal_3rows<GByte, false>(
    3250             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3251             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3252             :     double &dfRes2, double &dfRes3)
    3253             : {
    3254    35560100 :     GDALResampleConvolutionHorizontal_3rows_SSE2(
    3255             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3256             :         dfRes1, dfRes2, dfRes3);
    3257    35560100 : }
    3258             : 
    3259             : template <>
    3260          86 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16, false>(
    3261             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3262             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    3263             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
    3264             : {
    3265          86 :     GDALResampleConvolutionHorizontal_3rows_SSE2(
    3266             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3267             :         dfRes1, dfRes2, dfRes3);
    3268          86 : }
    3269             : 
    3270             : /************************************************************************/
    3271             : /*    GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>    */
    3272             : /************************************************************************/
    3273             : 
    3274             : template <class T>
    3275     7840250 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3276             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    3277             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3278             :     double &dfRes2, double &dfRes3)
    3279             : {
    3280     7840250 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    3281     7840250 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    3282     7840250 :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
    3283     7840250 :     int i = 0;  // Use after for.
    3284    19104350 :     for (; i < nSrcPixelCount - 3; i += 4)
    3285             :     {
    3286             :         // Retrieve the pixel & accumulate.
    3287    11264100 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    3288    11264100 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
    3289    11264100 :         const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
    3290    11264100 :         const XMMReg4Double v_weight =
    3291    11264100 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3292             : 
    3293    11264100 :         v_acc1 += v_pixels1 * v_weight;
    3294    11264100 :         v_acc2 += v_pixels2 * v_weight;
    3295    11264100 :         v_acc3 += v_pixels3 * v_weight;
    3296             :     }
    3297             : 
    3298     7840250 :     dfRes1 = v_acc1.GetHorizSum();
    3299     7840250 :     dfRes2 = v_acc2.GetHorizSum();
    3300     7840250 :     dfRes3 = v_acc3.GetHorizSum();
    3301             : 
    3302    12290222 :     for (; i < nSrcPixelCount; ++i)
    3303             :     {
    3304     4449942 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    3305     4449942 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    3306     4449942 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    3307             :     }
    3308     7840250 : }
    3309             : 
    3310             : /************************************************************************/
    3311             : /*    GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>     */
    3312             : /************************************************************************/
    3313             : 
    3314             : template <>
    3315             : inline void
    3316     7773100 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte, false>(
    3317             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3318             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3319             :     double &dfRes2, double &dfRes3)
    3320             : {
    3321     7773100 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3322             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3323             :         dfRes1, dfRes2, dfRes3);
    3324     7773100 : }
    3325             : 
    3326             : template <>
    3327             : inline void
    3328       67150 : GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16, false>(
    3329             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3330             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    3331             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
    3332             : {
    3333       67150 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3334             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3335             :         dfRes1, dfRes2, dfRes3);
    3336       67150 : }
    3337             : 
    3338             : /************************************************************************/
    3339             : /*      GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>      */
    3340             : /************************************************************************/
    3341             : 
    3342             : template <class T>
    3343    13996740 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3344             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    3345             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
    3346             :     double &dfRes3)
    3347             : {
    3348    13996740 :     const XMMReg4Double v_weight =
    3349             :         XMMReg4Double::Load4ValAligned(padfWeightsAligned);
    3350             : 
    3351             :     // Retrieve the pixel & accumulate.
    3352    13996740 :     const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
    3353    13996740 :     const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
    3354    13996740 :     const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
    3355             : 
    3356    13996740 :     XMMReg4Double v_acc1 = v_pixels1 * v_weight;
    3357    13996740 :     XMMReg4Double v_acc2 = v_pixels2 * v_weight;
    3358    13996740 :     XMMReg4Double v_acc3 = v_pixels3 * v_weight;
    3359             : 
    3360    13996740 :     dfRes1 = v_acc1.GetHorizSum();
    3361    13996740 :     dfRes2 = v_acc2.GetHorizSum();
    3362    13996740 :     dfRes3 = v_acc3.GetHorizSum();
    3363    13996740 : }
    3364             : 
    3365             : /************************************************************************/
    3366             : /*      GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>       */
    3367             : /************************************************************************/
    3368             : 
    3369             : template <>
    3370     8284020 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte, false>(
    3371             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3372             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
    3373             :     double &dfRes3)
    3374             : {
    3375     8284020 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3376             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    3377             :         dfRes3);
    3378     8284020 : }
    3379             : 
    3380             : template <>
    3381     5712720 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16, false>(
    3382             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3383             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
    3384             :     double &dfRes2, double &dfRes3)
    3385             : {
    3386     5712720 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3387             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    3388             :         dfRes3);
    3389     5712720 : }
    3390             : 
    3391             : #endif  // USE_SSE2
    3392             : 
    3393             : /************************************************************************/
    3394             : /*                   GDALResampleChunk_Convolution()                    */
    3395             : /************************************************************************/
    3396             : 
    3397             : template <class T, class Twork, GDALDataType eWrkDataType,
    3398             :           bool bKernelWithNegativeWeights, bool bNeedRescale>
    3399        5148 : static CPLErr GDALResampleChunk_ConvolutionT(
    3400             :     const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
    3401             :     FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
    3402             :     int nKernelRadius, float fMaxVal)
    3403             : 
    3404             : {
    3405        5148 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    3406        5148 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    3407        5148 :     const double dfSrcXDelta = args.dfSrcXDelta;
    3408        5148 :     const double dfSrcYDelta = args.dfSrcYDelta;
    3409        5148 :     constexpr int nBands = 1;
    3410        5148 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    3411        5148 :     const int nChunkXOff = args.nChunkXOff;
    3412        5148 :     const int nChunkXSize = args.nChunkXSize;
    3413        5148 :     const int nChunkYOff = args.nChunkYOff;
    3414        5148 :     const int nChunkYSize = args.nChunkYSize;
    3415        5148 :     const int nDstXOff = args.nDstXOff;
    3416        5148 :     const int nDstXOff2 = args.nDstXOff2;
    3417        5148 :     const int nDstYOff = args.nDstYOff;
    3418        5148 :     const int nDstYOff2 = args.nDstYOff2;
    3419        5148 :     const bool bHasNoData = args.bHasNoData;
    3420        5148 :     double dfNoDataValue = args.dfNoDataValue;
    3421             : 
    3422        5148 :     if (!bHasNoData)
    3423        5049 :         dfNoDataValue = 0.0;
    3424        5148 :     const auto dstDataType = args.eOvrDataType;
    3425        5148 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
    3426        5148 :     const double dfReplacementVal =
    3427          99 :         bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
    3428             :                    : dfNoDataValue;
    3429             :     // cppcheck-suppress unreadVariable
    3430        5148 :     const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
    3431        5148 :     const bool bNoDataValueInt64Valid =
    3432        5148 :         isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
    3433        5148 :     const auto nNodataValueInt64 =
    3434             :         bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
    3435        5148 :     constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
    3436             : 
    3437             :     // TODO: we should have some generic function to do this.
    3438        5148 :     Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
    3439        5148 :     Twork fDstMax = cpl::NumericLimits<Twork>::max();
    3440        5148 :     if (dstDataType == GDT_UInt8)
    3441             :     {
    3442        4218 :         fDstMin = std::numeric_limits<GByte>::min();
    3443        4218 :         fDstMax = std::numeric_limits<GByte>::max();
    3444             :     }
    3445         930 :     else if (dstDataType == GDT_Int8)
    3446             :     {
    3447           1 :         fDstMin = std::numeric_limits<GInt8>::min();
    3448           1 :         fDstMax = std::numeric_limits<GInt8>::max();
    3449             :     }
    3450         929 :     else if (dstDataType == GDT_UInt16)
    3451             :     {
    3452         402 :         fDstMin = std::numeric_limits<GUInt16>::min();
    3453         402 :         fDstMax = std::numeric_limits<GUInt16>::max();
    3454             :     }
    3455         527 :     else if (dstDataType == GDT_Int16)
    3456             :     {
    3457         292 :         fDstMin = std::numeric_limits<GInt16>::min();
    3458         292 :         fDstMax = std::numeric_limits<GInt16>::max();
    3459             :     }
    3460         235 :     else if (dstDataType == GDT_UInt32)
    3461             :     {
    3462           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
    3463           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
    3464             :     }
    3465         234 :     else if (dstDataType == GDT_Int32)
    3466             :     {
    3467             :         // cppcheck-suppress unreadVariable
    3468           6 :         fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
    3469             :         // cppcheck-suppress unreadVariable
    3470           6 :         fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
    3471             :     }
    3472         228 :     else if (dstDataType == GDT_UInt64)
    3473             :     {
    3474             :         // cppcheck-suppress unreadVariable
    3475           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
    3476             :         // cppcheck-suppress unreadVariable
    3477             :         // (1 << 64) - 2048: largest uint64 value a double can hold
    3478           1 :         fDstMax = static_cast<Twork>(18446744073709549568ULL);
    3479             :     }
    3480         227 :     else if (dstDataType == GDT_Int64)
    3481             :     {
    3482             :         // cppcheck-suppress unreadVariable
    3483           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
    3484             :         // cppcheck-suppress unreadVariable
    3485             :         // (1 << 63) - 1024: largest int64 that a double can hold
    3486           1 :         fDstMax = static_cast<Twork>(9223372036854774784LL);
    3487             :     }
    3488             : 
    3489        5148 :     bool bHasNaN = false;
    3490         490 :     if (pabyChunkNodataMask)
    3491             :     {
    3492             :         if constexpr (std::is_floating_point_v<T>)
    3493             :         {
    3494      120140 :             for (size_t i = 0;
    3495      120140 :                  i < static_cast<size_t>(nChunkXSize) * nChunkYSize; ++i)
    3496             :             {
    3497      120122 :                 if (std::isnan(pChunk[i]))
    3498             :                 {
    3499          24 :                     bHasNaN = true;
    3500          24 :                     break;
    3501             :                 }
    3502             :             }
    3503             :         }
    3504             :     }
    3505             : 
    3506    36948368 :     auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
    3507             :                                bNoDataValueInt64Valid, nNodataValueInt64,
    3508             :                                dfNoDataValue, dfReplacementVal](Twork fVal)
    3509             :     {
    3510    15839600 :         if (!bHasNoData)
    3511    11618500 :             return fVal;
    3512             : 
    3513             :         // Clamp value before comparing to nodata: this is only needed for
    3514             :         // kernels with negative weights (Lanczos)
    3515     4221160 :         Twork fClamped = fVal;
    3516     4221160 :         if (fClamped < fDstMin)
    3517       15998 :             fClamped = fDstMin;
    3518     4205160 :         else if (fClamped > fDstMax)
    3519       16406 :             fClamped = fDstMax;
    3520     4221160 :         if (isIntegerDT)
    3521             :         {
    3522     4220480 :             if (bNoDataValueInt64Valid)
    3523             :             {
    3524     4220470 :                 const double fClampedRounded = double(std::round(fClamped));
    3525     8440960 :                 if (fClampedRounded >=
    3526             :                         static_cast<double>(static_cast<Twork>(
    3527     8440960 :                             std::numeric_limits<int64_t>::min())) &&
    3528             :                     fClampedRounded <= static_cast<double>(static_cast<Twork>(
    3529     8440960 :                                            9223372036854774784LL)) &&
    3530     4220470 :                     nNodataValueInt64 ==
    3531     4220480 :                         static_cast<GInt64>(std::round(fClamped)))
    3532             :                 {
    3533             :                     // Do not use the nodata value
    3534       14435 :                     return static_cast<Twork>(dfReplacementVal);
    3535             :                 }
    3536             :             }
    3537             :         }
    3538         679 :         else if (dfNoDataValue == static_cast<double>(fClamped))
    3539             :         {
    3540             :             // Do not use the nodata value
    3541           1 :             return static_cast<Twork>(dfReplacementVal);
    3542             :         }
    3543     4206720 :         return fClamped;
    3544             :     };
    3545             : 
    3546             :     /* -------------------------------------------------------------------- */
    3547             :     /*      Allocate work buffers.                                          */
    3548             :     /* -------------------------------------------------------------------- */
    3549        5148 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    3550        5148 :     Twork *pafWrkScanline = nullptr;
    3551        5148 :     if (dstDataType != eWrkDataType)
    3552             :     {
    3553             :         pafWrkScanline =
    3554        4936 :             static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
    3555        4936 :         if (pafWrkScanline == nullptr)
    3556           0 :             return CE_Failure;
    3557             :     }
    3558             : 
    3559        5148 :     const double dfXScale = 1.0 / dfXRatioDstToSrc;
    3560        5148 :     const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
    3561        5148 :     const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
    3562        5148 :     const double dfYScale = 1.0 / dfYRatioDstToSrc;
    3563        5148 :     const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
    3564        5148 :     const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
    3565             : 
    3566             :     // Temporary array to store result of horizontal filter.
    3567             :     double *const padfHorizontalFiltered = static_cast<double *>(
    3568        5148 :         VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
    3569             : 
    3570             :     // To store convolution coefficients.
    3571             :     double *const padfWeights =
    3572        5148 :         static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
    3573             :             static_cast<int>(
    3574             :                 2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) + 0.5) *
    3575             :             sizeof(double)));
    3576             : 
    3577        5148 :     GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
    3578        5148 :     if (pabyChunkNodataMask)
    3579             :         pabyChunkNodataMaskHorizontalFiltered =
    3580         462 :             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
    3581        5148 :     if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
    3582         462 :         (pabyChunkNodataMask != nullptr &&
    3583             :          pabyChunkNodataMaskHorizontalFiltered == nullptr))
    3584             :     {
    3585           0 :         VSIFree(pafWrkScanline);
    3586           0 :         VSIFree(padfHorizontalFiltered);
    3587           0 :         VSIFreeAligned(padfWeights);
    3588           0 :         VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    3589           0 :         return CE_Failure;
    3590             :     }
    3591             : 
    3592             :     /* ==================================================================== */
    3593             :     /*      First pass: horizontal filter                                   */
    3594             :     /* ==================================================================== */
    3595        5148 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    3596             : #ifdef USE_SSE2
    3597        5148 :     const bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
    3598             : #endif
    3599     3046832 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    3600             :     {
    3601     3041688 :         const double dfSrcPixel =
    3602     3041688 :             (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
    3603     3041688 :         int nSrcPixelStart =
    3604     3041688 :             static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
    3605     3041688 :         if (nSrcPixelStart < nChunkXOff)
    3606       57361 :             nSrcPixelStart = nChunkXOff;
    3607     3041688 :         int nSrcPixelStop =
    3608     3041688 :             static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
    3609     3041688 :         if (nSrcPixelStop > nChunkRightXOff)
    3610       57376 :             nSrcPixelStop = nChunkRightXOff;
    3611             : #if 0
    3612             :         if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
    3613             :         {
    3614             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3615             :         }
    3616             :         if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
    3617             :         {
    3618             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3619             :         }
    3620             : #endif
    3621     3041688 :         const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
    3622     3041688 :         double dfWeightSum = 0.0;
    3623             : 
    3624             :         // Compute convolution coefficients.
    3625     3041688 :         int nSrcPixel = nSrcPixelStart;
    3626     3041688 :         double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
    3627     4436866 :         for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
    3628             :         {
    3629     1395184 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
    3630     1395184 :             dfX += dfXScaleWeight;
    3631     1395184 :             padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
    3632     1395184 :             dfX += dfXScaleWeight;
    3633     1395184 :             padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
    3634     1395184 :             dfX += dfXScaleWeight;
    3635     1395184 :             padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
    3636     1395184 :             dfX += dfXScaleWeight;
    3637     1395184 :             dfWeightSum +=
    3638     1395184 :                 pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
    3639             :         }
    3640     7032688 :         for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
    3641             :         {
    3642     3991000 :             const double dfWeight = pfnFilterFunc(dfX);
    3643     3991000 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
    3644     3991000 :             dfWeightSum += dfWeight;
    3645             :         }
    3646             : 
    3647     3041688 :         const int nHeight = nChunkYSize * nBands;
    3648     3041688 :         if (pabyChunkNodataMask == nullptr)
    3649             :         {
    3650             :             // For floating-point data types, we must scale values down a bit
    3651             :             // if input values are close to +/- std::numeric_limits<T>::max()
    3652             : #ifdef OLD_CPPCHECK
    3653             :             constexpr double mulFactor = 1;
    3654             : #else
    3655     2958653 :             constexpr double mulFactor =
    3656             :                 (bNeedRescale &&
    3657             :                  (std::is_same_v<T, float> || std::is_same_v<T, double>))
    3658             :                     ? 2
    3659             :                     : 1;
    3660             : #endif
    3661             : 
    3662     2958653 :             if (dfWeightSum != 0)
    3663             :             {
    3664     2958653 :                 const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
    3665    11921984 :                 for (int i = 0; i < nSrcPixelCount; ++i)
    3666             :                 {
    3667     8963341 :                     padfWeights[i] *= dfInvWeightSum;
    3668             :                 }
    3669             :             }
    3670             : 
    3671   179403230 :             const auto ScaleValue = [  // undoes the mulFactor down-scaling applied to padfWeights above
    3672             : #ifdef _MSC_VER
    3673             :                                         mulFactor  // captured explicitly only in MSVC builds (_MSC_VER)
    3674             : #endif
    3675             :             ](double dfVal, [[maybe_unused]] const T *inputValues,
    3676             :                                     [[maybe_unused]] int nInputValues)
    3677             :             {
    3678   179403000 :                 constexpr bool isFloat =
    3679             :                     std::is_same_v<T, float> || std::is_same_v<T, double>;
    3680             :                 if constexpr (isFloat)  // any other T falls through to the final return
    3681             :                 {
    3682     4070140 :                     if (std::isfinite(dfVal))
    3683             :                     {
    3684             :                         return std::clamp(dfVal,  // clamp to +/-DBL_MAX / mulFactor so the product below stays finite
    3685    12204800 :                                           -std::numeric_limits<double>::max() /
    3686             :                                               mulFactor,
    3687     4068260 :                                           std::numeric_limits<double>::max() /
    3688     4068260 :                                               mulFactor) *
    3689     4068260 :                                mulFactor;
    3690             :                     }
    3691             :                     else if constexpr (bKernelWithNegativeWeights)
    3692             :                     {
    3693         936 :                         if (std::isnan(dfVal))
    3694             :                         {
    3695             :                             // Either one of the input values is NaN or they are +/-Inf
    3696         936 :                             const bool isPositive = inputValues[0] >= 0;
    3697        6008 :                             for (int i = 0; i < nInputValues; ++i)
    3698             :                             {
    3699        5384 :                                 if (std::isnan(inputValues[i]))
    3700         312 :                                     return dfVal;  // an input is NaN: keep the NaN result
    3701             :                                 // cppcheck-suppress knownConditionTrueFalse
    3702        5072 :                                 if ((inputValues[i] >= 0) != isPositive)
    3703           0 :                                     return dfVal;  // inputs on both sides of zero: keep the NaN result
    3704             :                             }
    3705             :                             // All values are positive or negative infinity
    3706         624 :                             return static_cast<double>(inputValues[0]);
    3707             :                         }
    3708             :                     }
    3709             :                 }
    3710   175334000 :                 return dfVal;  // T not float/double, or non-finite value not handled above
    3711             :             };
    3712             : 
    3713     2958653 :             int iSrcLineOff = 0;
    3714             : #ifdef USE_SSE2
    3715     2958653 :             if (nSrcPixelCount == 4)
    3716             :             {
    3717    15867269 :                 for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
    3718             :                 {
    3719    15253428 :                     const size_t j =
    3720    15253428 :                         static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3721    15253428 :                         (nSrcPixelStart - nChunkXOff);
    3722    15253428 :                     double dfVal1 = 0.0;
    3723    15253428 :                     double dfVal2 = 0.0;
    3724    15253428 :                     double dfVal3 = 0.0;
    3725             :                     if constexpr (std::is_floating_point_v<T>)
    3726             :                     {
    3727     1256690 :                         if (bHasNaN)
    3728             :                         {
    3729             :                             GDALResampleConvolutionHorizontalPixelCount4_3rows<
    3730           0 :                                 T, true>(pChunk + j, pChunk + j + nChunkXSize,
    3731           0 :                                          pChunk + j + 2 * nChunkXSize,
    3732             :                                          padfWeights, dfVal1, dfVal2, dfVal3);
    3733             :                         }
    3734             :                         else
    3735             :                         {
    3736             :                             GDALResampleConvolutionHorizontalPixelCount4_3rows<
    3737     1256690 :                                 T, false>(pChunk + j, pChunk + j + nChunkXSize,
    3738     1256690 :                                           pChunk + j + 2 * nChunkXSize,
    3739             :                                           padfWeights, dfVal1, dfVal2, dfVal3);
    3740             :                         }
    3741             :                     }
    3742             :                     else
    3743             :                     {
    3744             :                         GDALResampleConvolutionHorizontalPixelCount4_3rows<
    3745    13996738 :                             T, false>(pChunk + j, pChunk + j + nChunkXSize,
    3746    13996738 :                                       pChunk + j + 2 * nChunkXSize, padfWeights,
    3747             :                                       dfVal1, dfVal2, dfVal3);
    3748             :                     }
    3749    30506830 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3750    15253428 :                                                nDstXSize +
    3751    15253428 :                                            iDstPixel - nDstXOff] =
    3752    15253428 :                         ScaleValue(dfVal1, pChunk + j, 4);
    3753    30506830 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3754    15253428 :                                             1) *
    3755    15253428 :                                                nDstXSize +
    3756    15253428 :                                            iDstPixel - nDstXOff] =
    3757    15253428 :                         ScaleValue(dfVal2, pChunk + j + nChunkXSize, 4);
    3758    15253837 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3759    15253428 :                                             2) *
    3760    15253428 :                                                nDstXSize +
    3761    15253428 :                                            iDstPixel - nDstXOff] =
    3762    15253428 :                         ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize, 4);
    3763             :                 }
    3764             :             }
    3765     2344804 :             else if (bSrcPixelCountLess8)
    3766             :             {
    3767     9927838 :                 for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
    3768             :                 {
    3769     7859228 :                     const size_t j =
    3770     7859228 :                         static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3771     7859228 :                         (nSrcPixelStart - nChunkXOff);
    3772     7859228 :                     double dfVal1 = 0.0;
    3773     7859228 :                     double dfVal2 = 0.0;
    3774     7859228 :                     double dfVal3 = 0.0;
    3775             :                     if constexpr (std::is_floating_point_v<T>)
    3776             :                     {
    3777       18980 :                         if (bHasNaN)
    3778             :                         {
    3779             :                             GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
    3780           0 :                                 T, true>(pChunk + j, pChunk + j + nChunkXSize,
    3781           0 :                                          pChunk + j + 2 * nChunkXSize,
    3782             :                                          padfWeights, nSrcPixelCount, dfVal1,
    3783             :                                          dfVal2, dfVal3);
    3784             :                         }
    3785             :                         else
    3786             :                         {
    3787             :                             GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
    3788       18980 :                                 T, false>(pChunk + j, pChunk + j + nChunkXSize,
    3789       18980 :                                           pChunk + j + 2 * nChunkXSize,
    3790             :                                           padfWeights, nSrcPixelCount, dfVal1,
    3791             :                                           dfVal2, dfVal3);
    3792             :                         }
    3793             :                     }
    3794             :                     else
    3795             :                     {
    3796             :                         GDALResampleConvolutionHorizontalPixelCountLess8_3rows<
    3797     7840248 :                             T, false>(pChunk + j, pChunk + j + nChunkXSize,
    3798     7840248 :                                       pChunk + j + 2 * nChunkXSize, padfWeights,
    3799             :                                       nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3800             :                     }
    3801    15718416 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3802     7859228 :                                                nDstXSize +
    3803     7859228 :                                            iDstPixel - nDstXOff] =
    3804     7859228 :                         ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
    3805    15718416 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3806     7859228 :                                             1) *
    3807     7859228 :                                                nDstXSize +
    3808     7859228 :                                            iDstPixel - nDstXOff] =
    3809     7859228 :                         ScaleValue(dfVal2, pChunk + j + nChunkXSize,
    3810             :                                    nSrcPixelCount);
    3811     7859316 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3812     7859228 :                                             2) *
    3813     7859228 :                                                nDstXSize +
    3814     7859228 :                                            iDstPixel - nDstXOff] =
    3815     7859228 :                         ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
    3816             :                                    nSrcPixelCount);
    3817             :                 }
    3818             :             }
    3819             :             else
    3820             : #endif
    3821             :             {
    3822    35902058 :                 for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
    3823             :                 {
    3824    35625944 :                     const size_t j =
    3825    35625944 :                         static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3826    35625944 :                         (nSrcPixelStart - nChunkXOff);
    3827    35625944 :                     double dfVal1 = 0.0;
    3828    35625944 :                     double dfVal2 = 0.0;
    3829    35625944 :                     double dfVal3 = 0.0;
    3830             :                     if constexpr (std::is_floating_point_v<T>)
    3831             :                     {
    3832       65696 :                         if (bHasNaN)
    3833             :                         {
    3834           0 :                             GDALResampleConvolutionHorizontal_3rows<T, true>(
    3835           0 :                                 pChunk + j, pChunk + j + nChunkXSize,
    3836           0 :                                 pChunk + j + 2 * nChunkXSize, padfWeights,
    3837             :                                 nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3838             :                         }
    3839             :                         else
    3840             :                         {
    3841       65696 :                             GDALResampleConvolutionHorizontal_3rows<T, false>(
    3842       65696 :                                 pChunk + j, pChunk + j + nChunkXSize,
    3843       65696 :                                 pChunk + j + 2 * nChunkXSize, padfWeights,
    3844             :                                 nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3845             :                         }
    3846             :                     }
    3847             :                     else
    3848             :                     {
    3849    35560248 :                         GDALResampleConvolutionHorizontal_3rows<T, false>(
    3850    35560248 :                             pChunk + j, pChunk + j + nChunkXSize,
    3851    35560248 :                             pChunk + j + 2 * nChunkXSize, padfWeights,
    3852             :                             nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3853             :                     }
    3854    71251798 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3855    35625944 :                                                nDstXSize +
    3856    35625944 :                                            iDstPixel - nDstXOff] =
    3857    35625944 :                         ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
    3858    71251798 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3859    35625944 :                                             1) *
    3860    35625944 :                                                nDstXSize +
    3861    35625944 :                                            iDstPixel - nDstXOff] =
    3862    35625944 :                         ScaleValue(dfVal2, pChunk + j + nChunkXSize,
    3863             :                                    nSrcPixelCount);
    3864    35691048 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3865    35625944 :                                             2) *
    3866    35625944 :                                                nDstXSize +
    3867    35625944 :                                            iDstPixel - nDstXOff] =
    3868    35625944 :                         ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
    3869             :                                    nSrcPixelCount);
    3870             :                 }
    3871             :             }
    3872     6146150 :             for (; iSrcLineOff < nHeight; ++iSrcLineOff)
    3873             :             {
    3874     3187493 :                 const size_t j =
    3875     3187493 :                     static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3876     3187493 :                     (nSrcPixelStart - nChunkXOff);
    3877     3736653 :                 const double dfVal = GDALResampleConvolutionHorizontal(
    3878      595200 :                     pChunk + j, padfWeights, nSrcPixelCount);
    3879     3187942 :                 padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3880     3187493 :                                            nDstXSize +
    3881     3187493 :                                        iDstPixel - nDstXOff] =
    3882     3187493 :                     ScaleValue(dfVal, pChunk + j, nSrcPixelCount);
    3883             :             }
    3884             :         }
    3885             :         else
    3886             :         {
    3887    19189371 :             for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
    3888             :             {
    3889    19106322 :                 const size_t j =
    3890    19106322 :                     static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3891    19106322 :                     (nSrcPixelStart - nChunkXOff);
    3892             : 
    3893             :                 if (bKernelWithNegativeWeights)
    3894             :                 {
    3895    18580308 :                     int nConsecutiveValid = 0;
    3896    18580308 :                     int nMaxConsecutiveValid = 0;
    3897   170151146 :                     for (int k = 0; k < nSrcPixelCount; k++)
    3898             :                     {
    3899   151569938 :                         if (pabyChunkNodataMask[j + k])
    3900    43681801 :                             nConsecutiveValid++;
    3901   107888837 :                         else if (nConsecutiveValid)
    3902             :                         {
    3903      107830 :                             nMaxConsecutiveValid = std::max(
    3904      107830 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    3905      107830 :                             nConsecutiveValid = 0;
    3906             :                         }
    3907             :                     }
    3908    18580308 :                     nMaxConsecutiveValid =
    3909    18580308 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    3910    18580308 :                     if (nMaxConsecutiveValid < nSrcPixelCount / 2)
    3911             :                     {
    3912    12651307 :                         const size_t nTempOffset =
    3913    12651307 :                             static_cast<size_t>(iSrcLineOff) * nDstXSize +
    3914    12651307 :                             iDstPixel - nDstXOff;
    3915    12651307 :                         padfHorizontalFiltered[nTempOffset] = 0.0;
    3916    12651307 :                         pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3917    12651307 :                         continue;
    3918             :                     }
    3919             :                 }
    3920             : 
    3921     6455025 :                 double dfVal = 0.0;
    3922             :                 if constexpr (std::is_floating_point_v<T>)
    3923             :                 {
    3924       46368 :                     if (bHasNaN)
    3925             :                     {
    3926        1792 :                         GDALResampleConvolutionHorizontalWithMask<T, true>(
    3927        1792 :                             pChunk + j, pabyChunkNodataMask + j, padfWeights,
    3928             :                             nSrcPixelCount, dfVal, dfWeightSum);
    3929             :                     }
    3930             :                     else
    3931             :                     {
    3932       44576 :                         GDALResampleConvolutionHorizontalWithMask<T, false>(
    3933       44576 :                             pChunk + j, pabyChunkNodataMask + j, padfWeights,
    3934             :                             nSrcPixelCount, dfVal, dfWeightSum);
    3935             :                     }
    3936             :                 }
    3937             :                 else
    3938             :                 {
    3939     6408657 :                     GDALResampleConvolutionHorizontalWithMask<T, false>(
    3940          63 :                         pChunk + j, pabyChunkNodataMask + j, padfWeights,
    3941             :                         nSrcPixelCount, dfVal, dfWeightSum);
    3942             :                 }
    3943     6455025 :                 const size_t nTempOffset =
    3944     6455025 :                     static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
    3945     6455025 :                     nDstXOff;
    3946     6455025 :                 if (dfWeightSum > 0.0)
    3947             :                 {
    3948     6410360 :                     padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
    3949     6410360 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
    3950             :                 }
    3951             :                 else
    3952             :                 {
    3953       44663 :                     padfHorizontalFiltered[nTempOffset] = 0.0;
    3954       44663 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3955             :                 }
    3956             :             }
    3957             :         }
    3958             :     }
    3959             : 
    3960             :     /* ==================================================================== */
    3961             :     /*      Second pass: vertical filter                                    */
    3962             :     /* ==================================================================== */
    3963        5148 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    3964             : 
    3965      396762 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    3966             :     {
    3967      391614 :         Twork *const pafDstScanline =
    3968             :             pafWrkScanline
    3969      391614 :                 ? pafWrkScanline
    3970       14028 :                 : static_cast<Twork *>(pDstBuffer) +
    3971       14028 :                       static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
    3972             : 
    3973      391614 :         const double dfSrcLine =
    3974      391614 :             (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
    3975      391614 :         int nSrcLineStart =
    3976      391614 :             static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
    3977      391614 :         int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
    3978      391614 :         if (nSrcLineStart < nChunkYOff)
    3979        3486 :             nSrcLineStart = nChunkYOff;
    3980      391614 :         if (nSrcLineStop > nChunkBottomYOff)
    3981        3530 :             nSrcLineStop = nChunkBottomYOff;
    3982             : #if 0
    3983             :         if( nSrcLineStart < nChunkYOff &&
    3984             :             nChunkYOff > 0 )
    3985             :         {
    3986             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    3987             :         }
    3988             :         if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
    3989             :         {
    3990             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    3991             :         }
    3992             : #endif
    3993      391614 :         const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
    3994      391614 :         double dfWeightSum = 0.0;
    3995             : 
    3996             :         // Compute convolution coefficients.
    3997      391614 :         int nSrcLine = nSrcLineStart;  // Used after for.
    3998      391614 :         double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
    3999     1004175 :         for (; nSrcLine < nSrcLineStop - 3;
    4000      612561 :              nSrcLine += 4, dfY += 4 * dfYScaleWeight)
    4001             :         {
    4002      612561 :             padfWeights[nSrcLine - nSrcLineStart] = dfY;
    4003      612561 :             padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
    4004      612561 :             padfWeights[nSrcLine + 2 - nSrcLineStart] =
    4005      612561 :                 dfY + 2 * dfYScaleWeight;
    4006      612561 :             padfWeights[nSrcLine + 3 - nSrcLineStart] =
    4007      612561 :                 dfY + 3 * dfYScaleWeight;
    4008      612561 :             dfWeightSum +=
    4009      612561 :                 pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
    4010             :         }
    4011      429592 :         for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
    4012             :         {
    4013       37978 :             const double dfWeight = pfnFilterFunc(dfY);
    4014       37978 :             padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
    4015       37978 :             dfWeightSum += dfWeight;
    4016             :         }
    4017             : 
    4018      391614 :         if (pabyChunkNodataMask == nullptr)
    4019             :         {
    4020             :             // For floating-point data types, we must scale values down a bit
    4021             :             // if input values are close to +/- std::numeric_limits<T>::max()
    4022             : #ifdef OLD_CPPCHECK
    4023             :             constexpr double mulFactor = 1;
    4024             : #else
    4025      355578 :             constexpr double mulFactor =
    4026             :                 (bNeedRescale &&
    4027             :                  (std::is_same_v<T, float> || std::is_same_v<T, double>))
    4028             :                     ? 2
    4029             :                     : 1;
    4030             : #endif
    4031             : 
    4032      355578 :             if (dfWeightSum != 0)
    4033             :             {
    4034      355578 :                 const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
    4035     2594627 :                 for (int i = 0; i < nSrcLineCount; ++i)
    4036     2239055 :                     padfWeights[i] *= dfInvWeightSum;
    4037             :             }
    4038             : 
    4039      355578 :             int iFilteredPixelOff = 0;  // Used after for.
    4040             :             // j used after for.
    4041      355578 :             size_t j =
    4042      355578 :                 (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
    4043             : #ifdef USE_SSE2
    4044             :             if constexpr ((!bNeedRescale || !std::is_same_v<T, float>) &&
    4045             :                           eWrkDataType == GDT_Float32)
    4046             :             {
    4047             : #ifdef __AVX__
    4048             :                 for (; iFilteredPixelOff < nDstXSize - 15;
    4049             :                      iFilteredPixelOff += 16, j += 16)
    4050             :                 {
    4051             :                     GDALResampleConvolutionVertical_16cols(
    4052             :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    4053             :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    4054             :                     if (bHasNoData)
    4055             :                     {
    4056             :                         for (int k = 0; k < 16; k++)
    4057             :                         {
    4058             :                             pafDstScanline[iFilteredPixelOff + k] =
    4059             :                                 replaceValIfNodata(
    4060             :                                     pafDstScanline[iFilteredPixelOff + k]);
    4061             :                         }
    4062             :                     }
    4063             :                 }
    4064             : #else
    4065    26036009 :                 for (; iFilteredPixelOff < nDstXSize - 7;
    4066             :                      iFilteredPixelOff += 8, j += 8)
    4067             :                 {
    4068    25689208 :                     GDALResampleConvolutionVertical_8cols(
    4069    25689208 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    4070    25689208 :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    4071    25689208 :                     if (bHasNoData)
    4072             :                     {
    4073      123192 :                         for (int k = 0; k < 8; k++)
    4074             :                         {
    4075      109504 :                             pafDstScanline[iFilteredPixelOff + k] =
    4076      109504 :                                 replaceValIfNodata(
    4077      109504 :                                     pafDstScanline[iFilteredPixelOff + k]);
    4078             :                         }
    4079             :                     }
    4080             :                 }
    4081             : #endif
    4082             : 
    4083      816719 :                 for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
    4084             :                 {
    4085      469960 :                     const Twork fVal =
    4086      469960 :                         static_cast<Twork>(GDALResampleConvolutionVertical(
    4087      469960 :                             padfHorizontalFiltered + j, nDstXSize, padfWeights,
    4088             :                             nSrcLineCount));
    4089      469960 :                     pafDstScanline[iFilteredPixelOff] =
    4090      469960 :                         replaceValIfNodata(fVal);
    4091             :                 }
    4092             :             }
    4093             :             else
    4094             : #endif
    4095             :             {
    4096     5862642 :                 const auto ScaleValue = [
    4097             : #ifdef _MSC_VER
    4098             :                                             mulFactor
    4099             : #endif
    4100             :                 ](double dfVal, [[maybe_unused]] const double *inputValues,
    4101             :                                         [[maybe_unused]] int nStride,
    4102             :                                         [[maybe_unused]] int nInputValues)
    4103             :                 {
    4104     5862640 :                     constexpr bool isFloat =
    4105             :                         std::is_same_v<T, float> || std::is_same_v<T, double>;
    4106             :                     if constexpr (isFloat)
    4107             :                     {
    4108     5862640 :                         if (std::isfinite(dfVal))
    4109             :                         {
    4110             :                             return std::clamp(
    4111             :                                        dfVal,
    4112             :                                        static_cast<double>(
    4113    17585400 :                                            -std::numeric_limits<Twork>::max()) /
    4114             :                                            mulFactor,
    4115             :                                        static_cast<double>(
    4116     5861800 :                                            std::numeric_limits<Twork>::max()) /
    4117     5861800 :                                            mulFactor) *
    4118     5861800 :                                    mulFactor;
    4119             :                         }
    4120             :                         else if constexpr (bKernelWithNegativeWeights)
    4121             :                         {
    4122         480 :                             if (std::isnan(dfVal))
    4123             :                             {
    4124             :                                 // Either one of the input value is NaN or they are +/-Inf
    4125         480 :                                 const bool isPositive = inputValues[0] >= 0;
    4126        2520 :                                 for (int i = 0; i < nInputValues; ++i)
    4127             :                                 {
    4128        2200 :                                     if (std::isnan(inputValues[i * nStride]))
    4129         160 :                                         return dfVal;
    4130             :                                     // cppcheck-suppress knownConditionTrueFalse
    4131        2040 :                                     if ((inputValues[i] >= 0) != isPositive)
    4132           0 :                                         return dfVal;
    4133             :                                 }
    4134             :                                 // All values are positive or negative infinity
    4135         320 :                                 return inputValues[0];
    4136             :                             }
    4137             :                         }
    4138             :                     }
    4139             : 
    4140         360 :                     return dfVal;
    4141             :                 };
    4142             : 
    4143     2939422 :                 for (; iFilteredPixelOff < nDstXSize - 1;
    4144             :                      iFilteredPixelOff += 2, j += 2)
    4145             :                 {
    4146     2930610 :                     double dfVal1 = 0.0;
    4147     2930610 :                     double dfVal2 = 0.0;
    4148     2930610 :                     GDALResampleConvolutionVertical_2cols(
    4149     2930610 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    4150             :                         nSrcLineCount, dfVal1, dfVal2);
    4151     5861220 :                     pafDstScanline[iFilteredPixelOff] =
    4152     2930610 :                         replaceValIfNodata(static_cast<Twork>(
    4153     2930610 :                             ScaleValue(dfVal1, padfHorizontalFiltered + j,
    4154             :                                        nDstXSize, nSrcLineCount)));
    4155     2930610 :                     pafDstScanline[iFilteredPixelOff + 1] =
    4156     2930610 :                         replaceValIfNodata(static_cast<Twork>(
    4157     2930610 :                             ScaleValue(dfVal2, padfHorizontalFiltered + j + 1,
    4158             :                                        nDstXSize, nSrcLineCount)));
    4159             :                 }
    4160        8819 :                 if (iFilteredPixelOff < nDstXSize)
    4161             :                 {
    4162        1427 :                     const double dfVal = GDALResampleConvolutionVertical(
    4163        1427 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    4164             :                         nSrcLineCount);
    4165        1427 :                     pafDstScanline[iFilteredPixelOff] =
    4166        1427 :                         replaceValIfNodata(static_cast<Twork>(
    4167        1427 :                             ScaleValue(dfVal, padfHorizontalFiltered + j,
    4168             :                                        nDstXSize, nSrcLineCount)));
    4169             :                 }
    4170             :             }
    4171             :         }
    4172             :         else
    4173             :         {
    4174    18368135 :             for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
    4175             :                  ++iFilteredPixelOff)
    4176             :             {
    4177    18332129 :                 double dfVal = 0.0;
    4178    18332129 :                 dfWeightSum = 0.0;
    4179    18332129 :                 size_t j = (nSrcLineStart - nChunkYOff) *
    4180    18332129 :                                static_cast<size_t>(nDstXSize) +
    4181    18332129 :                            iFilteredPixelOff;
    4182             :                 if (bKernelWithNegativeWeights)
    4183             :                 {
    4184    18088237 :                     int nConsecutiveValid = 0;
    4185    18088237 :                     int nMaxConsecutiveValid = 0;
    4186   127259921 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    4187             :                     {
    4188   109171284 :                         const double dfWeight =
    4189   109171284 :                             padfWeights[i] *
    4190             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    4191   109171284 :                         if (pabyChunkNodataMaskHorizontalFiltered[j])
    4192             :                         {
    4193    46111301 :                             nConsecutiveValid++;
    4194             :                         }
    4195    63060183 :                         else if (nConsecutiveValid)
    4196             :                         {
    4197      204376 :                             nMaxConsecutiveValid = std::max(
    4198      204376 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    4199      204376 :                             nConsecutiveValid = 0;
    4200             :                         }
    4201   109171284 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    4202   109171284 :                         dfWeightSum += dfWeight;
    4203             :                     }
    4204    18088237 :                     nMaxConsecutiveValid =
    4205    18088237 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    4206    18088237 :                     if (nMaxConsecutiveValid < nSrcLineCount / 2)
    4207             :                     {
    4208     8918591 :                         pafDstScanline[iFilteredPixelOff] =
    4209     8918499 :                             static_cast<Twork>(dfNoDataValue);
    4210     8918591 :                         continue;
    4211             :                     }
    4212             :                 }
    4213             :                 else
    4214             :                 {
    4215     1239606 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    4216             :                     {
    4217      995712 :                         const double dfWeight =
    4218      995712 :                             padfWeights[i] *
    4219             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    4220      995712 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    4221      995712 :                         dfWeightSum += dfWeight;
    4222             :                     }
    4223             :                 }
    4224     9413558 :                 if (dfWeightSum > 0.0)
    4225             :                 {
    4226     9397519 :                     pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
    4227     9397171 :                         static_cast<Twork>(dfVal / dfWeightSum));
    4228             :                 }
    4229             :                 else
    4230             :                 {
    4231       16045 :                     pafDstScanline[iFilteredPixelOff] =
    4232       16021 :                         static_cast<Twork>(dfNoDataValue);
    4233             :                 }
    4234             :             }
    4235             :         }
    4236             : 
    4237      391614 :         if (fMaxVal != 0.0f)
    4238             :         {
    4239             :             if constexpr (std::is_same_v<T, double>)
    4240             :             {
    4241           0 :                 for (int i = 0; i < nDstXSize; ++i)
    4242             :                 {
    4243           0 :                     if (pafDstScanline[i] > static_cast<double>(fMaxVal))
    4244           0 :                         pafDstScanline[i] = static_cast<double>(fMaxVal);
    4245             :                 }
    4246             :             }
    4247             :             else
    4248             :             {
    4249      192324 :                 for (int i = 0; i < nDstXSize; ++i)
    4250             :                 {
    4251      192088 :                     if (pafDstScanline[i] > fMaxVal)
    4252       96022 :                         pafDstScanline[i] = fMaxVal;
    4253             :                 }
    4254             :             }
    4255             :         }
    4256             : 
    4257      391614 :         if (pafWrkScanline)
    4258             :         {
    4259      377586 :             GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
    4260             :                             static_cast<GByte *>(pDstBuffer) +
    4261      377586 :                                 static_cast<size_t>(iDstLine - nDstYOff) *
    4262      377586 :                                     nDstXSize * nDstDataTypeSize,
    4263             :                             dstDataType, nDstDataTypeSize, nDstXSize);
    4264             :         }
    4265             :     }
    4266             : 
    4267        5148 :     VSIFree(pafWrkScanline);
    4268        5148 :     VSIFreeAligned(padfWeights);
    4269        5148 :     VSIFree(padfHorizontalFiltered);
    4270        5148 :     VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    4271             : 
    4272        5148 :     return CE_None;
    4273             : }
    4274             : 
    //               : GDALResampleChunk_ConvolutionInternal()
    //               :
    //               : Prepares one convolution resampling run and hands it over to the
    //               : GDALResampleChunk_ConvolutionT instantiation matching
    //               : args.eWrkDataType.
    //               :
    //               : Template parameters bKernelWithNegativeWeights and bNeedRescale are
    //               : forwarded unchanged to GDALResampleChunk_ConvolutionT.
    //               :
    //               : args                 Resampling request. Fields read here:
    //               :                      pszResampling, nOvrNBITS, eOvrDataType,
    //               :                      eWrkDataType, nDstXOff/nDstXOff2 and
    //               :                      nDstYOff/nDstYOff2.
    //               : pChunk               Source samples; cast below to the element type
    //               :                      that corresponds to args.eWrkDataType.
    //               : ppDstBuffer          Receives the destination buffer allocated here
    //               :                      with VSI_MALLOC3_VERBOSE.
    //               : peDstBufferDataType  Set to args.eOvrDataType once the allocation
    //               :                      has succeeded.
    //               :
    //               : Returns the value of GDALResampleChunk_ConvolutionT, or CE_Failure
    //               : when the resampling name is not one of BILINEAR / CUBIC /
    //               : CUBICSPLINE / LANCZOS, when the allocation fails, or when
    //               : args.eWrkDataType is not one of the four types handled by the
    //               : switch.
    4275             : template <bool bKernelWithNegativeWeights, bool bNeedRescale>
    4276             : static CPLErr
    4277        5148 : GDALResampleChunk_ConvolutionInternal(const GDALOverviewResampleArgs &args,
    4278             :                                       const void *pChunk, void **ppDstBuffer,
    4279             :                                       GDALDataType *peDstBufferDataType)
    4280             : {
    //               :     Translate the resampling name into the GDALResampleAlg value
    //               :     used to look up the kernel radius and filter callbacks below.
    4281             :     GDALResampleAlg eResample;
    4282        5148 :     if (EQUAL(args.pszResampling, "BILINEAR"))
    4283        2666 :         eResample = GRA_Bilinear;
    4284        2482 :     else if (EQUAL(args.pszResampling, "CUBIC"))
    4285        2300 :         eResample = GRA_Cubic;
    4286         182 :     else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
    4287          86 :         eResample = GRA_CubicSpline;
    4288          96 :     else if (EQUAL(args.pszResampling, "LANCZOS"))
    4289          96 :         eResample = GRA_Lanczos;
    4290             :     else
    4291             :     {
    //               :         Any other name is treated as a programming error; note that
    //               :         *ppDstBuffer and *peDstBufferDataType are left untouched on
    //               :         this path.
    4292           0 :         CPLAssert(false);
    4293             :         return CE_Failure;
    4294             :     }
    4295        5148 :     const int nKernelRadius = GWKGetFilterRadius(eResample);
    4296        5148 :     FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
    4297             :     const FilterFunc4ValuesType pfnFilterFunc4Values =
    4298        5148 :         GWKGetFilterFunc4Values(eResample);
    4299             : 
    //               :     fMaxVal == 0 means "no clamping": GDALResampleChunk_ConvolutionT
    //               :     only clamps its output scanline when fMaxVal != 0.
    4300        5148 :     float fMaxVal = 0.f;
    4301             :     // Cubic, etc... can have overshoots, so make sure we clamp values to the
    4302             :     // maximum value if NBITS is set.
    4303        5148 :     if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
    4304           8 :         (args.eOvrDataType == GDT_UInt8 || args.eOvrDataType == GDT_UInt16 ||
    4305           0 :          args.eOvrDataType == GDT_UInt32))
    4306             :     {
    4307           8 :         int nBits = args.nOvrNBITS;
    //               :         An NBITS equal to the full bit width of the overview type
    //               :         leaves fMaxVal at 0, i.e. no clamp value is set.
    4308           8 :         if (nBits == GDALGetDataTypeSizeBits(args.eOvrDataType))
    4309           1 :             nBits = 0;
    //               :         fMaxVal becomes 2^nBits - 1; the nBits < 32 guard keeps the
    //               :         shift count of the unsigned literal below 32.
    4310           8 :         if (nBits > 0 && nBits < 32)
    4311           7 :             fMaxVal = static_cast<float>((1U << nBits) - 1);
    4312             :     }
    4313             : 
    //               :     Destination buffer: (nDstXOff2 - nDstXOff) columns by
    //               :     (nDstYOff2 - nDstYOff) lines of args.eOvrDataType samples.
    4314        5148 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(
    4315             :         args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
    4316             :         GDALGetDataTypeSizeBytes(args.eOvrDataType));
    4317        5148 :     if (*ppDstBuffer == nullptr)
    4318             :     {
    4319           0 :         return CE_Failure;
    4320             :     }
    4321        5148 :     *peDstBufferDataType = args.eOvrDataType;
    4322             : 
    //               :     Pick the instantiation from the working data type. The three
    //               :     template arguments before the forwarded booleans are: source
    //               :     element type, working type, and the GDALDataType of the working
    //               :     type. UInt8, UInt16 and Float32 input all work in float; only
    //               :     Float64 input works in double.
    4323        5148 :     switch (args.eWrkDataType)
    4324             :     {
    4325        4256 :         case GDT_UInt8:
    4326             :         {
    4327             :             return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32,
    4328             :                                                   bKernelWithNegativeWeights,
    4329        4256 :                                                   bNeedRescale>(
    4330             :                 args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
    4331        4256 :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
    4332             :         }
    4333             : 
    4334         402 :         case GDT_UInt16:
    4335             :         {
    4336             :             return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32,
    4337             :                                                   bKernelWithNegativeWeights,
    4338         402 :                                                   bNeedRescale>(
    4339             :                 args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
    4340         402 :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
    4341             :         }
    4342             : 
    4343         387 :         case GDT_Float32:
    4344             :         {
    4345             :             return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32,
    4346             :                                                   bKernelWithNegativeWeights,
    4347         387 :                                                   bNeedRescale>(
    4348             :                 args, static_cast<const float *>(pChunk), *ppDstBuffer,
    4349         387 :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
    4350             :         }
    4351             : 
    4352         103 :         case GDT_Float64:
    4353             :         {
    4354             :             return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64,
    4355             :                                                   bKernelWithNegativeWeights,
    4356         103 :                                                   bNeedRescale>(
    4357             :                 args, static_cast<const double *>(pChunk), *ppDstBuffer,
    4358         103 :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
    4359             :         }
    4360             : 
    4361           0 :         default:
    4362           0 :             break;
    4363             :     }
    4364             : 
    //               :     Reached only for a working data type not handled above.
    //               :     NOTE(review): the buffer stored in *ppDstBuffer is not released
    //               :     on this path; presumably the caller owns and frees it - confirm.
    4365           0 :     CPLAssert(false);
    4366             :     return CE_Failure;
    4367             : }
    4368             : 
    //               : GDALResampleChunk_Convolution()
    //               :
    //               : Entry point for convolution resampling of one chunk: selects the
    //               : <bKernelWithNegativeWeights, bNeedRescale> instantiation of
    //               : GDALResampleChunk_ConvolutionInternal from args.pszResampling and
    //               : forwards all four arguments to it unchanged.
    //               :
    //               :   CUBIC, LANCZOS   -> <true,  true>
    //               :   CUBICSPLINE      -> <false, true>
    //               :   anything else    -> <false, false>
    //               :
    //               : The last row is the one BILINEAR takes. Names that
    //               : GDALResampleChunk_ConvolutionInternal does not recognise also end
    //               : up there and are rejected by that function with CE_Failure.
    //               :
    //               : Returns whatever the selected instantiation returns.
    4369             : static CPLErr
    4370        5148 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
    4371             :                               const void *pChunk, void **ppDstBuffer,
    4372             :                               GDALDataType *peDstBufferDataType)
    4373             : {
    4374        5148 :     if (EQUAL(args.pszResampling, "CUBIC") ||
    4375        2848 :         EQUAL(args.pszResampling, "LANCZOS"))
    4376             :         return GDALResampleChunk_ConvolutionInternal<
    4377        2396 :             /* bKernelWithNegativeWeights=*/true, /* bNeedRescale = */ true>(
    4378        2396 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    4379        2752 :     else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
    //               :         Same template parameter order as above:
    //               :         bKernelWithNegativeWeights = false, bNeedRescale = true.
    4380          86 :         return GDALResampleChunk_ConvolutionInternal<false, true>(
    4381          86 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    4382             :     else
    //               :         bKernelWithNegativeWeights = false, bNeedRescale = false.
    4383        2666 :         return GDALResampleChunk_ConvolutionInternal<false, false>(
    4384        2666 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    4385             : }
    4386             : 
    4387             : /************************************************************************/
    4388             : /*                       GDALResampleChunkC32R()                        */
    4389             : /************************************************************************/
    4390             : 
    4391           2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
    4392             :                                     const float *pafChunk, const int nChunkYOff,
    4393             :                                     const int nChunkYSize, const int nDstYOff,
    4394             :                                     const int nDstYOff2, const int nOvrXSize,
    4395             :                                     const int nOvrYSize, void **ppDstBuffer,
    4396             :                                     GDALDataType *peDstBufferDataType,
    4397             :                                     const char *pszResampling)
    4398             : 
    4399             : {
    4400             :     enum Method
    4401             :     {
    4402             :         NEAR,
    4403             :         AVERAGE,
    4404             :         AVERAGE_MAGPHASE,
    4405             :         RMS,
    4406             :     };
    4407             : 
    4408           2 :     Method eMethod = NEAR;
    4409           2 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))
    4410             :     {
    4411           0 :         eMethod = NEAR;
    4412             :     }
    4413           2 :     else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
    4414             :     {
    4415           0 :         eMethod = AVERAGE_MAGPHASE;
    4416             :     }
    4417           2 :     else if (EQUAL(pszResampling, "RMS"))
    4418             :     {
    4419           2 :         eMethod = RMS;
    4420             :     }
    4421           0 :     else if (STARTS_WITH_CI(pszResampling, "AVER"))
    4422             :     {
    4423           0 :         eMethod = AVERAGE;
    4424             :     }
    4425             :     else
    4426             :     {
    4427           0 :         CPLError(
    4428             :             CE_Failure, CPLE_NotSupported,
    4429             :             "Resampling method %s is not supported for complex data types. "
    4430             :             "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
    4431             :             pszResampling);
    4432           0 :         return CE_Failure;
    4433             :     }
    4434             : 
    4435           2 :     const int nOXSize = nOvrXSize;
    4436           2 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
    4437             :                                        GDALGetDataTypeSizeBytes(GDT_CFloat32));
    4438           2 :     if (*ppDstBuffer == nullptr)
    4439             :     {
    4440           0 :         return CE_Failure;
    4441             :     }
    4442           2 :     float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
    4443           2 :     *peDstBufferDataType = GDT_CFloat32;
    4444             : 
    4445           2 :     const int nOYSize = nOvrYSize;
    4446           2 :     const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
    4447           2 :     const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
    4448             : 
    4449             :     /* ==================================================================== */
    4450             :     /*      Loop over destination scanlines.                                */
    4451             :     /* ==================================================================== */
    4452           8 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    4453             :     {
    4454           6 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    4455           6 :         if (nSrcYOff < nChunkYOff)
    4456           0 :             nSrcYOff = nChunkYOff;
    4457             : 
    4458           6 :         int nSrcYOff2 =
    4459           6 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
    4460           6 :         if (nSrcYOff2 == nSrcYOff)
    4461           0 :             nSrcYOff2++;
    4462             : 
    4463           6 :         if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
    4464             :         {
    4465           2 :             if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
    4466           0 :                 nSrcYOff = nSrcHeight - 1;
    4467           2 :             nSrcYOff2 = nSrcHeight;
    4468             :         }
    4469           6 :         if (nSrcYOff2 > nChunkYOff + nChunkYSize)
    4470           0 :             nSrcYOff2 = nChunkYOff + nChunkYSize;
    4471             : 
    4472           6 :         const float *const pafSrcScanline =
    4473           6 :             pafChunk +
    4474           6 :             (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
    4475           6 :         float *const pafDstScanline =
    4476           6 :             pafDstBuffer +
    4477           6 :             static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
    4478             : 
    4479             :         /* --------------------------------------------------------------------
    4480             :          */
    4481             :         /*      Loop over destination pixels */
    4482             :         /* --------------------------------------------------------------------
    4483             :          */
    4484          18 :         for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
    4485             :         {
    4486          12 :             const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
    4487          12 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    4488          12 :             int nSrcXOff2 =
    4489          12 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
    4490          12 :             if (nSrcXOff2 == nSrcXOff)
    4491           0 :                 nSrcXOff2++;
    4492          12 :             if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
    4493             :             {
    4494           6 :                 if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
    4495           0 :                     nSrcXOff = nSrcWidth - 1;
    4496           6 :                 nSrcXOff2 = nSrcWidth;
    4497             :             }
    4498          12 :             const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
    4499             : 
    4500          12 :             if (eMethod == NEAR)
    4501             :             {
    4502           0 :                 pafDstScanline[iDstPixelSZ * 2] =
    4503           0 :                     pafSrcScanline[nSrcXOffSZ * 2];
    4504           0 :                 pafDstScanline[iDstPixelSZ * 2 + 1] =
    4505           0 :                     pafSrcScanline[nSrcXOffSZ * 2 + 1];
    4506             :             }
    4507          12 :             else if (eMethod == AVERAGE_MAGPHASE)
    4508             :             {
    4509           0 :                 double dfTotalR = 0.0;
    4510           0 :                 double dfTotalI = 0.0;
    4511           0 :                 double dfTotalM = 0.0;
    4512           0 :                 size_t nCount = 0;
    4513             : 
    4514           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4515             :                 {
    4516           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4517             :                     {
    4518           0 :                         const double dfR = double(
    4519           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4520           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4521           0 :                                                nSrcWidth * 2]);
    4522           0 :                         const double dfI = double(
    4523           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4524           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4525           0 :                                                nSrcWidth * 2 +
    4526           0 :                                            1]);
    4527           0 :                         dfTotalR += dfR;
    4528           0 :                         dfTotalI += dfI;
    4529           0 :                         dfTotalM += std::hypot(dfR, dfI);
    4530           0 :                         ++nCount;
    4531             :                     }
    4532             :                 }
    4533             : 
    4534           0 :                 CPLAssert(nCount > 0);
    4535           0 :                 if (nCount == 0)
    4536             :                 {
    4537           0 :                     pafDstScanline[iDstPixelSZ * 2] = 0.0;
    4538           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
    4539             :                 }
    4540             :                 else
    4541             :                 {
    4542           0 :                     pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
    4543           0 :                         dfTotalR / static_cast<double>(nCount));
    4544           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
    4545           0 :                         dfTotalI / static_cast<double>(nCount));
    4546             :                     const double dfM =
    4547           0 :                         double(std::hypot(pafDstScanline[iDstPixelSZ * 2],
    4548           0 :                                           pafDstScanline[iDstPixelSZ * 2 + 1]));
    4549           0 :                     const double dfDesiredM =
    4550           0 :                         dfTotalM / static_cast<double>(nCount);
    4551           0 :                     double dfRatio = 1.0;
    4552           0 :                     if (dfM != 0.0)
    4553           0 :                         dfRatio = dfDesiredM / dfM;
    4554             : 
    4555           0 :                     pafDstScanline[iDstPixelSZ * 2] *=
    4556           0 :                         static_cast<float>(dfRatio);
    4557           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] *=
    4558           0 :                         static_cast<float>(dfRatio);
    4559             :                 }
    4560             :             }
    4561          12 :             else if (eMethod == RMS)
    4562             :             {
    4563          12 :                 double dfTotalR = 0.0;
    4564          12 :                 double dfTotalI = 0.0;
    4565          12 :                 size_t nCount = 0;
    4566             : 
    4567          36 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4568             :                 {
    4569          72 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4570             :                     {
    4571          48 :                         const double dfR = double(
    4572          48 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4573          48 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4574          48 :                                                nSrcWidth * 2]);
    4575          48 :                         const double dfI = double(
    4576          48 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4577          48 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4578          48 :                                                nSrcWidth * 2 +
    4579          48 :                                            1]);
    4580             : 
    4581          48 :                         dfTotalR += SQUARE(dfR);
    4582          48 :                         dfTotalI += SQUARE(dfI);
    4583             : 
    4584          48 :                         ++nCount;
    4585             :                     }
    4586             :                 }
    4587             : 
    4588          12 :                 CPLAssert(nCount > 0);
    4589          12 :                 if (nCount == 0)
    4590             :                 {
    4591           0 :                     pafDstScanline[iDstPixelSZ * 2] = 0.0;
    4592           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
    4593             :                 }
    4594             :                 else
    4595             :                 {
    4596             :                     /* compute RMS */
    4597          12 :                     pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
    4598          12 :                         sqrt(dfTotalR / static_cast<double>(nCount)));
    4599          12 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
    4600          12 :                         sqrt(dfTotalI / static_cast<double>(nCount)));
    4601             :                 }
    4602             :             }
    4603           0 :             else if (eMethod == AVERAGE)
    4604             :             {
    4605           0 :                 double dfTotalR = 0.0;
    4606           0 :                 double dfTotalI = 0.0;
    4607           0 :                 size_t nCount = 0;
    4608             : 
    4609           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4610             :                 {
    4611           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4612             :                     {
    4613             :                         // TODO(schwehr): Maybe use std::complex?
    4614           0 :                         dfTotalR += double(
    4615           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4616           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4617           0 :                                                nSrcWidth * 2]);
    4618           0 :                         dfTotalI += double(
    4619           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4620           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4621           0 :                                                nSrcWidth * 2 +
    4622           0 :                                            1]);
    4623           0 :                         ++nCount;
    4624             :                     }
    4625             :                 }
    4626             : 
    4627           0 :                 CPLAssert(nCount > 0);
    4628           0 :                 if (nCount == 0)
    4629             :                 {
    4630           0 :                     pafDstScanline[iDstPixelSZ * 2] = 0.0;
    4631           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
    4632             :                 }
    4633             :                 else
    4634             :                 {
    4635           0 :                     pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
    4636           0 :                         dfTotalR / static_cast<double>(nCount));
    4637           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
    4638           0 :                         dfTotalI / static_cast<double>(nCount));
    4639             :                 }
    4640             :             }
    4641             :         }
    4642             :     }
    4643             : 
    4644           2 :     return CE_None;
    4645             : }
    4646             : 
    4647             : /************************************************************************/
    4648             : /*                  GDALRegenerateCascadingOverviews()                  */
    4649             : /*                                                                      */
    4650             : /*      Generate a list of overviews in order from largest to           */
    4651             : /*      smallest, computing each from the next larger.                  */
    4652             : /************************************************************************/
    4653             : 
    4654          44 : static CPLErr GDALRegenerateCascadingOverviews(
    4655             :     GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
    4656             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    4657             :     void *pProgressData, CSLConstList papszOptions)
    4658             : 
    4659             : {
    4660             :     /* -------------------------------------------------------------------- */
    4661             :     /*      First, we must put the overviews in order from largest to       */
    4662             :     /*      smallest.                                                       */
    4663             :     /* -------------------------------------------------------------------- */
    4664         127 :     for (int i = 0; i < nOverviews - 1; ++i)
    4665             :     {
    4666         292 :         for (int j = 0; j < nOverviews - i - 1; ++j)
    4667             :         {
    4668         209 :             if (papoOvrBands[j]->GetXSize() *
    4669         209 :                     static_cast<float>(papoOvrBands[j]->GetYSize()) <
    4670         209 :                 papoOvrBands[j + 1]->GetXSize() *
    4671         209 :                     static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
    4672             :             {
    4673           0 :                 GDALRasterBand *poTempBand = papoOvrBands[j];
    4674           0 :                 papoOvrBands[j] = papoOvrBands[j + 1];
    4675           0 :                 papoOvrBands[j + 1] = poTempBand;
    4676             :             }
    4677             :         }
    4678             :     }
    4679             : 
    4680             :     /* -------------------------------------------------------------------- */
    4681             :     /*      Count total pixels so we can prepare appropriate scaled         */
    4682             :     /*      progress functions.                                             */
    4683             :     /* -------------------------------------------------------------------- */
    4684          44 :     double dfTotalPixels = 0.0;
    4685             : 
    4686         171 :     for (int i = 0; i < nOverviews; ++i)
    4687             :     {
    4688         127 :         dfTotalPixels += papoOvrBands[i]->GetXSize() *
    4689         127 :                          static_cast<double>(papoOvrBands[i]->GetYSize());
    4690             :     }
    4691             : 
    4692             :     /* -------------------------------------------------------------------- */
    4693             :     /*      Generate all the bands.                                         */
    4694             :     /* -------------------------------------------------------------------- */
    4695          44 :     double dfPixelsProcessed = 0.0;
    4696             : 
    4697          88 :     CPLStringList aosOptions(papszOptions);
    4698          44 :     aosOptions.SetNameValue("CASCADING", "YES");
    4699         171 :     for (int i = 0; i < nOverviews; ++i)
    4700             :     {
    4701         127 :         GDALRasterBand *poBaseBand = poSrcBand;
    4702         127 :         if (i != 0)
    4703          83 :             poBaseBand = papoOvrBands[i - 1];
    4704             : 
    4705         127 :         double dfPixels = papoOvrBands[i]->GetXSize() *
    4706         127 :                           static_cast<double>(papoOvrBands[i]->GetYSize());
    4707             : 
    4708         254 :         void *pScaledProgressData = GDALCreateScaledProgress(
    4709             :             dfPixelsProcessed / dfTotalPixels,
    4710         127 :             (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
    4711             :             pProgressData);
    4712             : 
    4713         254 :         const CPLErr eErr = GDALRegenerateOverviewsEx(
    4714             :             poBaseBand, 1,
    4715         127 :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
    4716             :             pszResampling, GDALScaledProgress, pScaledProgressData,
    4717         127 :             aosOptions.List());
    4718         127 :         GDALDestroyScaledProgress(pScaledProgressData);
    4719             : 
    4720         127 :         if (eErr != CE_None)
    4721           0 :             return eErr;
    4722             : 
    4723         127 :         dfPixelsProcessed += dfPixels;
    4724             : 
    4725             :         // Only do the bit2grayscale promotion on the base band.
    4726         127 :         if (STARTS_WITH_CI(pszResampling,
    4727             :                            "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
    4728           8 :             pszResampling = "AVERAGE";
    4729             :     }
    4730             : 
    4731          44 :     return CE_None;
    4732             : }
    4733             : 
    4734             : /************************************************************************/
    4735             : /*                      GDALGetResampleFunction()                       */
    4736             : /************************************************************************/
    4737             : 
    4738       16187 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
    4739             :                                              int *pnRadius)
    4740             : {
    4741       16187 :     if (pnRadius)
    4742       16187 :         *pnRadius = 0;
    4743       16187 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))
    4744         533 :         return GDALResampleChunk_Near;
    4745       15654 :     else if (STARTS_WITH_CI(pszResampling, "AVER") ||
    4746        4426 :              EQUAL(pszResampling, "RMS"))
    4747       11293 :         return GDALResampleChunk_AverageOrRMS;
    4748        4361 :     else if (EQUAL(pszResampling, "GAUSS"))
    4749             :     {
    4750          26 :         if (pnRadius)
    4751          26 :             *pnRadius = 1;
    4752          26 :         return GDALResampleChunk_Gauss;
    4753             :     }
    4754        4335 :     else if (EQUAL(pszResampling, "MODE"))
    4755         142 :         return GDALResampleChunk_Mode;
    4756        4193 :     else if (EQUAL(pszResampling, "CUBIC"))
    4757             :     {
    4758        1647 :         if (pnRadius)
    4759        1647 :             *pnRadius = GWKGetFilterRadius(GRA_Cubic);
    4760        1647 :         return GDALResampleChunk_Convolution;
    4761             :     }
    4762        2546 :     else if (EQUAL(pszResampling, "CUBICSPLINE"))
    4763             :     {
    4764          60 :         if (pnRadius)
    4765          60 :             *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
    4766          60 :         return GDALResampleChunk_Convolution;
    4767             :     }
    4768        2486 :     else if (EQUAL(pszResampling, "LANCZOS"))
    4769             :     {
    4770          50 :         if (pnRadius)
    4771          50 :             *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
    4772          50 :         return GDALResampleChunk_Convolution;
    4773             :     }
    4774        2436 :     else if (EQUAL(pszResampling, "BILINEAR"))
    4775             :     {
    4776        2436 :         if (pnRadius)
    4777        2436 :             *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
    4778        2436 :         return GDALResampleChunk_Convolution;
    4779             :     }
    4780             :     else
    4781             :     {
    4782           0 :         CPLError(
    4783             :             CE_Failure, CPLE_AppDefined,
    4784             :             "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
    4785             :             pszResampling);
    4786           0 :         return nullptr;
    4787             :     }
    4788             : }
    4789             : 
    4790             : /************************************************************************/
    4791             : /*                       GDALGetOvrWorkDataType()                       */
    4792             : /************************************************************************/
    4793             : 
    4794       16069 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
    4795             :                                     GDALDataType eSrcDataType)
    4796             : {
    4797       16069 :     if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
    4798             :     {
    4799         667 :         return eSrcDataType;
    4800             :     }
    4801       15402 :     else if (eSrcDataType == GDT_UInt8 &&
    4802       14829 :              (STARTS_WITH_CI(pszResampling, "AVER") ||
    4803        3699 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4804        2294 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4805        2274 :               EQUAL(pszResampling, "LANCZOS") ||
    4806        2267 :               EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
    4807             :     {
    4808       14822 :         return GDT_UInt8;
    4809             :     }
    4810         580 :     else if (eSrcDataType == GDT_UInt16 &&
    4811         131 :              (STARTS_WITH_CI(pszResampling, "AVER") ||
    4812         126 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4813           8 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4814           6 :               EQUAL(pszResampling, "LANCZOS") ||
    4815           3 :               EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
    4816             :     {
    4817         131 :         return GDT_UInt16;
    4818             :     }
    4819         449 :     else if (EQUAL(pszResampling, "GAUSS"))
    4820          20 :         return GDT_Float64;
    4821             : 
    4822         429 :     if (eSrcDataType == GDT_UInt8 || eSrcDataType == GDT_Int8 ||
    4823         428 :         eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
    4824             :         eSrcDataType == GDT_Float32)
    4825             :     {
    4826         277 :         return GDT_Float32;
    4827             :     }
    4828         152 :     return GDT_Float64;
    4829             : }
    4830             : 
    4831             : namespace
    4832             : {
    4833             : // Structure to hold a pointer to free with CPLFree()
    4834             : struct PointerHolder
    4835             : {
    4836             :     void *ptr = nullptr;
    4837             : 
    4838        4054 :     template <class T> explicit PointerHolder(T *&ptrIn) : ptr(ptrIn)
    4839             :     {
    4840        4054 :         ptrIn = nullptr;
    4841        4054 :     }
    4842             : 
    4843             :     template <class T>
    4844          32 :     explicit PointerHolder(std::unique_ptr<T, VSIFreeReleaser> ptrIn)
    4845          32 :         : ptr(ptrIn.release())
    4846             :     {
    4847          32 :     }
    4848             : 
    4849        4086 :     ~PointerHolder()
    4850        4086 :     {
    4851        4086 :         CPLFree(ptr);
    4852        4086 :     }
    4853             : 
    4854             :     PointerHolder(const PointerHolder &) = delete;
    4855             :     PointerHolder &operator=(const PointerHolder &) = delete;
    4856             : };
    4857             : }  // namespace
    4858             : 
    4859             : /************************************************************************/
    4860             : /*                      GDALRegenerateOverviews()                       */
    4861             : /************************************************************************/
    4862             : 
    4863             : /**
    4864             :  * \brief Generate downsampled overviews.
    4865             :  *
    4866             :  * This function will generate one or more overview images from a base image
    4867             :  * using the requested downsampling algorithm.  Its primary use is for
    4868             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4869             :  * used to generate downsampled images in one file from another outside the
    4870             :  * overview architecture.
    4871             :  *
    4872             :  * The output bands need to exist in advance.
    4873             :  *
    4874             :  * The full set of resampling algorithms is documented in
    4875             :  * GDALDataset::BuildOverviews().
    4876             :  *
    4877             :  * This function will properly honour NODATA_VALUES tuples (special dataset
    4878             :  * metadata) so that only a given RGB triplet (in case of an RGB image) will be
    4879             :  * considered as the nodata value and not each value of the triplet
    4880             :  * independently per band.
    4881             :  *
    4882             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4883             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    4884             :  * overview computation.
    4885             :  *
    4886             :  * @param hSrcBand the source (base level) band.
    4887             :  * @param nOverviewCount the number of downsampled bands being generated.
    4888             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4889             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4890             :  * @param pfnProgress progress report function.
    4891             :  * @param pProgressData progress function callback data.
    4892             :  * @return CE_None on success or CE_Failure on failure.
    4893             :  */
    4894         113 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
    4895             :                                GDALRasterBandH *pahOvrBands,
    4896             :                                const char *pszResampling,
    4897             :                                GDALProgressFunc pfnProgress,
    4898             :                                void *pProgressData)
    4899             : 
    4900             : {
    4901         113 :     return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
    4902             :                                      pszResampling, pfnProgress, pProgressData,
    4903         113 :                                      nullptr);
    4904             : }
    4905             : 
    4906             : /************************************************************************/
    4907             : /*                     GDALRegenerateOverviewsEx()                      */
    4908             : /************************************************************************/
    4909             : 
    4910             : constexpr int RADIUS_TO_DIAMETER = 2;
    4911             : 
    4912             : /**
    4913             :  * \brief Generate downsampled overviews.
    4914             :  *
    4915             :  * This function will generate one or more overview images from a base image
    4916             :  * using the requested downsampling algorithm.  Its primary use is for
    4917             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4918             :  * used to generate downsampled images in one file from another outside the
    4919             :  * overview architecture.
    4920             :  *
    4921             :  * The output bands need to exist in advance.
    4922             :  *
    4923             :  * The full set of resampling algorithms is documented in
    4924             :  * GDALDataset::BuildOverviews().
    4925             :  *
    4926             :  * This function will properly honour NODATA_VALUES tuples (special dataset
    4927             :  * metadata) so that only a given RGB triplet (in case of an RGB image) will be
    4928             :  * considered as the nodata value and not each value of the triplet
    4929             :  * independently per band.
    4930             :  *
    4931             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4932             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    4933             :  * overview computation.
    4934             :  *
    4935             :  * @param hSrcBand the source (base level) band.
    4936             :  * @param nOverviewCount the number of downsampled bands being generated.
    4937             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4938             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4939             :  * @param pfnProgress progress report function.
    4940             :  * @param pProgressData progress function callback data.
    4941             :  * @param papszOptions NULL terminated list of options as key=value pairs, or
    4942             :  * NULL
    4943             :  * @return CE_None on success or CE_Failure on failure.
    4944             :  * @since GDAL 3.6
    4945             :  */
    4946         780 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
    4947             :                                  GDALRasterBandH *pahOvrBands,
    4948             :                                  const char *pszResampling,
    4949             :                                  GDALProgressFunc pfnProgress,
    4950             :                                  void *pProgressData, CSLConstList papszOptions)
    4951             : 
    4952             : {
    4953         780 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    4954         780 :     GDALRasterBand **papoOvrBands =
    4955             :         reinterpret_cast<GDALRasterBand **>(pahOvrBands);
    4956             : 
    4957         780 :     if (pfnProgress == nullptr)
    4958         102 :         pfnProgress = GDALDummyProgress;
    4959             : 
    4960         780 :     if (EQUAL(pszResampling, "NONE"))
    4961          50 :         return CE_None;
    4962             : 
    4963         730 :     int nKernelRadius = 0;
    4964             :     GDALResampleFunction pfnResampleFn =
    4965         730 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    4966             : 
    4967         730 :     if (pfnResampleFn == nullptr)
    4968           0 :         return CE_Failure;
    4969             : 
    4970             :     /* -------------------------------------------------------------------- */
    4971             :     /*      Check color tables...                                           */
    4972             :     /* -------------------------------------------------------------------- */
    4973         730 :     GDALColorTable *poColorTable = nullptr;
    4974             : 
    4975         507 :     if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
    4976        1538 :          EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
    4977         312 :         poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    4978             :     {
    4979           9 :         poColorTable = poSrcBand->GetColorTable();
    4980           9 :         if (poColorTable != nullptr)
    4981             :         {
    4982           9 :             if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
    4983             :             {
    4984           0 :                 CPLError(CE_Warning, CPLE_AppDefined,
    4985             :                          "Computing overviews on palette index raster bands "
    4986             :                          "with a palette whose color interpretation is not RGB "
    4987             :                          "will probably lead to unexpected results.");
    4988           0 :                 poColorTable = nullptr;
    4989             :             }
    4990           9 :             else if (poColorTable->IsIdentity())
    4991             :             {
    4992           0 :                 poColorTable = nullptr;
    4993             :             }
    4994             :         }
    4995             :         else
    4996             :         {
    4997           0 :             CPLError(CE_Warning, CPLE_AppDefined,
    4998             :                      "Computing overviews on palette index raster bands "
    4999             :                      "without a palette will probably lead to unexpected "
    5000             :                      "results.");
    5001             :         }
    5002             :     }
    5003             :     // Not ready yet
    5004        2109 :     else if ((EQUAL(pszResampling, "CUBIC") ||
    5005         667 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    5006         667 :               EQUAL(pszResampling, "LANCZOS") ||
    5007        1468 :               EQUAL(pszResampling, "BILINEAR")) &&
    5008          80 :              poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    5009             :     {
    5010           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    5011             :                  "Computing %s overviews on palette index raster bands "
    5012             :                  "will probably lead to unexpected results.",
    5013             :                  pszResampling);
    5014             :     }
    5015             : 
    5016             :     // If we have a nodata mask and we are doing something more complicated
    5017             :     // than nearest neighbour resampling, we have to fetch the nodata mask.
    5018             : 
    5019         730 :     GDALRasterBand *poMaskBand = nullptr;
    5020         730 :     bool bUseNoDataMask = false;
    5021         730 :     bool bCanUseCascaded = true;
    5022             : 
    5023         730 :     if (!STARTS_WITH_CI(pszResampling, "NEAR"))
    5024             :     {
    5025             :         // Special case if we are an alpha/mask band. We want it to be
    5026             :         // considered as the mask band to avoid alpha=0 to be taken into account
    5027             :         // in average computation.
    5028         392 :         if (poSrcBand->IsMaskBand())
    5029             :         {
    5030          51 :             poMaskBand = poSrcBand;
    5031          51 :             bUseNoDataMask = true;
    5032             :         }
    5033             :         else
    5034             :         {
    5035         341 :             poMaskBand = poSrcBand->GetMaskBand();
    5036         341 :             const int nMaskFlags = poSrcBand->GetMaskFlags();
    5037         341 :             bCanUseCascaded =
    5038         341 :                 (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
    5039         341 :             bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
    5040             :         }
    5041             :     }
    5042             : 
    5043         730 :     int nHasNoData = 0;
    5044         730 :     const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
    5045         730 :     const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    5046             :     const bool bPropagateNoData =
    5047         730 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    5048             : 
    5049         798 :     if (poSrcBand->GetBand() == 1 && bUseNoDataMask &&
    5050          68 :         CSLFetchNameValue(papszOptions, "CASCADING") == nullptr)
    5051             :     {
    5052         112 :         std::string osDetailMessage;
    5053          56 :         if (poSrcBand->HasConflictingMaskSources(&osDetailMessage, false))
    5054             :         {
    5055           2 :             CPLError(
    5056             :                 CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
    5057             :                 bHasNoData
    5058             :                     ? "Only the nodata value will be taken into account."
    5059             :                     : "Only the first listed one will be taken into account.");
    5060             :         }
    5061             :     }
    5062             : 
    5063             :     /* -------------------------------------------------------------------- */
    5064             :     /*      If we are operating on multiple overviews, and using            */
    5065             :     /*      averaging, let's do them in cascading order to reduce the       */
    5066             :     /*      amount of computation.                                          */
    5067             :     /* -------------------------------------------------------------------- */
    5068             : 
    5069             :     // In case the mask may be computed from another band of the dataset,
    5070             :     // we can't use cascaded generation, as the computation of the overviews
    5071             :     // of the band used for the mask band may not have yet occurred (#3033).
    5072         730 :     if ((STARTS_WITH_CI(pszResampling, "AVER") ||
    5073         507 :          EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
    5074         476 :          EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
    5075         422 :          EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
    5076         730 :          EQUAL(pszResampling, "MODE")) &&
    5077          44 :         nOverviewCount > 1 && bCanUseCascaded)
    5078          44 :         return GDALRegenerateCascadingOverviews(
    5079             :             poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
    5080          44 :             pProgressData, papszOptions);
    5081             : 
    5082             :     /* -------------------------------------------------------------------- */
    5083             :     /*      Setup one horizontal swath to read from the raw buffer.         */
    5084             :     /* -------------------------------------------------------------------- */
    5085         686 :     int nFRXBlockSize = 0;
    5086         686 :     int nFRYBlockSize = 0;
    5087         686 :     poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
    5088             : 
    5089         686 :     const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
    5090        1034 :     const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
    5091         984 :                                        EQUAL(pszResampling, "MODE") ||
    5092         298 :                                        !GDALDataTypeIsComplex(eSrcDataType);
    5093             :     const GDALDataType eWrkDataType =
    5094             :         bUseGenericResampleFn
    5095         686 :             ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
    5096         686 :             : GDT_CFloat32;
    5097             : 
    5098         686 :     const int nWidth = poSrcBand->GetXSize();
    5099         686 :     const int nHeight = poSrcBand->GetYSize();
    5100             : 
    5101         686 :     int nMaxOvrFactor = 1;
    5102        1491 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    5103             :     {
    5104         805 :         const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
    5105         805 :         const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
    5106         805 :         nMaxOvrFactor = std::max(
    5107             :             nMaxOvrFactor,
    5108         805 :             static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
    5109         805 :         nMaxOvrFactor = std::max(
    5110             :             nMaxOvrFactor,
    5111         805 :             static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
    5112             :     }
    5113             : 
    5114         686 :     int nFullResYChunk = nFRYBlockSize;
    5115         686 :     int nMaxChunkYSizeQueried = 0;
    5116             : 
    5117             :     const auto UpdateChunkHeightAndGetChunkSize =
    5118        9220 :         [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
    5119       74721 :          eWrkDataType, nWidth]()
    5120             :     {
    5121             :         // Updates nFullResYChunk and nMaxChunkYSizeQueried, and returns the size in bytes of a buffer of nMaxChunkYSizeQueried rows of nWidth values of eWrkDataType, or GINTBIG_MAX when an intermediate computation would overflow.
    5122             :         // The minimum chunk height makes sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff + nFullResYChunk) / nMaxOvrFactor)
    5123        9220 :         if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
    5124             :         {
    5125           1 :             return GINTBIG_MAX;  // RADIUS_TO_DIAMETER * nMaxOvrFactor would overflow int
    5126             :         }
    5127        9219 :         nFullResYChunk =
    5128        9219 :             std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
    5129        9219 :         if ((nKernelRadius > 0 &&
    5130         970 :              nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
    5131        9219 :             nFullResYChunk >
    5132        9219 :                 INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
    5133             :         {
    5134           0 :             return GINTBIG_MAX;  // the kernel margin, or the chunk height plus that margin, would overflow int
    5135             :         }
    5136        9219 :         nMaxChunkYSizeQueried =
    5137        9219 :             nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
    5138        9219 :         if (GDALGetDataTypeSizeBytes(eWrkDataType) >
    5139        9219 :             std::numeric_limits<int64_t>::max() /
    5140        9219 :                 (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
    5141             :         {
    5142           1 :             return GINTBIG_MAX;  // the byte size would overflow int64_t
    5143             :         }
    5144        9218 :         return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
    5145        9218 :                nMaxChunkYSizeQueried * nWidth;
    5146         686 :     };
    5147             : 
    5148             :     const char *pszChunkYSize =
    5149         686 :         CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
    5150             : #ifndef __COVERITY__
    5151             :     // Only configurable for debug / testing
    5152         686 :     if (pszChunkYSize)
    5153             :     {
    5154           0 :         nFullResYChunk = atoi(pszChunkYSize);
    5155             :     }
    5156             : #endif
    5157             : 
    5158             :     // Only configurable for debug / testing
    5159             :     const int nChunkMaxSize =
    5160         686 :         atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
    5161             : 
    5162         686 :     auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
    5163         686 :     if (nChunkSize > nChunkMaxSize)
    5164             :     {
    5165          15 :         if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
    5166          44 :             !GDALDataTypeIsComplex(eSrcDataType) &&
    5167          14 :             (!STARTS_WITH_CI(pszResampling, "AVER") ||
    5168           2 :              EQUAL(pszResampling, "AVERAGE")))
    5169             :         {
    5170             :             // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
    5171             :             // which uses a block based strategy, which is much less memory
    5172             :             // hungry.
    5173          14 :             return GDALRegenerateOverviewsMultiBand(
    5174             :                 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
    5175          14 :                 pfnProgress, pProgressData, papszOptions);
    5176             :         }
    5177           1 :         else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
    5178             :         {
    5179           0 :             return GDALRegenerateCascadingOverviews(
    5180             :                 poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
    5181           0 :                 pfnProgress, pProgressData, papszOptions);
    5182             :         }
    5183             :     }
    5184         671 :     else if (pszChunkYSize == nullptr)
    5185             :     {
    5186             :         // Try to get as close as possible to nChunkMaxSize
    5187        9205 :         while (nChunkSize < nChunkMaxSize / 2)
    5188             :         {
    5189        8534 :             nFullResYChunk *= 2;
    5190        8534 :             nChunkSize = UpdateChunkHeightAndGetChunkSize();
    5191             :         }
    5192             :     }
    5193             : 
    5194             :     // Describes one resampling job: the inputs handed to the resampling function, its outputs, and a finished flag that another thread can poll or wait on
    5195             :     struct OvrJob
    5196             :     {
    5197             :         // Holders of the buffers to free when the job is finished (the two source holders are shared_ptr and are set through the setters below)
    5198             :         std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
    5199             :         std::shared_ptr<PointerHolder> oSrcBufferHolder{};
    5200             :         std::unique_ptr<PointerHolder> oDstBufferHolder{};
    5201             : 
    5202             :         GDALRasterBand *poDstBand = nullptr;
    5203             : 
    5204             :         // Input parameters of the resampling function (pfnResampleFn when bUseGenericResampleFn is true, GDALResampleChunkC32R otherwise)
    5205             :         GDALResampleFunction pfnResampleFn = nullptr;
    5206             :         int nSrcWidth = 0;
    5207             :         int nSrcHeight = 0;
    5208             :         int nDstWidth = 0;
    5209             :         GDALOverviewResampleArgs args{};
    5210             :         const void *pChunk = nullptr;
    5211             :         bool bUseGenericResampleFn = false;
    5212             : 
    5213             :         // Output values of the resampling function (eErr stays CE_Failure until the function has stored its status)
    5214             :         CPLErr eErr = CE_Failure;
    5215             :         void *pDstBuffer = nullptr;
    5216             :         GDALDataType eDstBufferDataType = GDT_Unknown;
    5217             : 
    5218           0 :         void SetSrcMaskBufferHolder(
    5219             :             const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
    5220             :         {
    5221           0 :             oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
    5222           0 :         }
    5223             : 
    5224           0 :         void SetSrcBufferHolder(
    5225             :             const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
    5226             :         {
    5227           0 :             oSrcBufferHolder = oSrcBufferHolderIn;
    5228           0 :         }
    5229             : 
    5230         774 :         void NotifyFinished()  // marks the job as finished and wakes up one waiter
    5231             :         {
    5232        1548 :             std::lock_guard guard(mutex);
    5233         774 :             bFinished = true;
    5234         774 :             cv.notify_one();
    5235         774 :         }
    5236             : 
    5237           0 :         bool IsFinished()  // non-blocking check of the finished flag
    5238             :         {
    5239           0 :             std::lock_guard guard(mutex);
    5240           0 :             return bFinished;
    5241             :         }
    5242             : 
    5243           0 :         void WaitFinished()  // blocks until NotifyFinished() has set the flag
    5244             :         {
    5245           0 :             std::unique_lock oGuard(mutex);
    5246           0 :             while (!bFinished)  // re-check after each wake-up: condition variables may wake spuriously
    5247             :             {
    5248           0 :                 cv.wait(oGuard);
    5249             :             }
    5250           0 :         }
    5251             : 
    5252             :       private:
    5253             :         // Synchronization: bFinished is only read or written with mutex held; cv is notified by NotifyFinished()
    5254             :         bool bFinished = false;
    5255             :         std::mutex mutex{};
    5256             :         std::condition_variable cv{};
    5257             :     };
    5258             : 
    5259             :     // Thread function to resample: pData is an OvrJob*; the status and output buffer are stored in the job, which is then flagged as finished
    5260         774 :     const auto JobResampleFunc = [](void *pData)
    5261             :     {
    5262         774 :         OvrJob *poJob = static_cast<OvrJob *>(pData);
    5263             : 
    5264         774 :         if (poJob->bUseGenericResampleFn)
    5265             :         {
    5266         772 :             poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
    5267             :                                                &(poJob->pDstBuffer),
    5268             :                                                &(poJob->eDstBufferDataType));
    5269             :         }
    5270             :         else
    5271             :         {
    5272           2 :             poJob->eErr = GDALResampleChunkC32R(  // pChunk is read as float data here
    5273             :                 poJob->nSrcWidth, poJob->nSrcHeight,
    5274           2 :                 static_cast<const float *>(poJob->pChunk),
    5275             :                 poJob->args.nChunkYOff, poJob->args.nChunkYSize,
    5276             :                 poJob->args.nDstYOff, poJob->args.nDstYOff2,
    5277             :                 poJob->args.nOvrXSize, poJob->args.nOvrYSize,
    5278             :                 &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
    5279             :                 poJob->args.pszResampling);
    5280             :         }
    5281             : 
    5282         774 :         auto pDstBuffer = poJob->pDstBuffer;
    5283         774 :         poJob->oDstBufferHolder = std::make_unique<PointerHolder>(pDstBuffer);  // hand the output buffer to a holder owned by the job
    5284             : 
    5285         774 :         poJob->NotifyFinished();  // done last: the job's fields are fully set before a waiter is released
    5286         774 :     };
    5287             : 
    5288             :     // Function to write resampled data to the target band: rows [nDstYOff, nDstYOff2) over nDstWidth columns from pDstBuffer; returns the RasterIO() status
    5289         774 :     const auto WriteJobData = [](const OvrJob *poJob)
    5290             :     {
    5291        1548 :         return poJob->poDstBand->RasterIO(
    5292         774 :             GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
    5293         774 :             poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
    5294         774 :             poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
    5295         774 :             poJob->eDstBufferDataType, 0, 0, nullptr);
    5296             :     };
    5297             : 
    5298             :     // Wait for completion of the oldest job (front of jobList), write its output if resampling succeeded, remove it from the list and return the resulting error code
    5299             :     const auto WaitAndFinalizeOldestJob =
    5300           0 :         [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    5301             :     {
    5302           0 :         auto poOldestJob = jobList.front().get();
    5303           0 :         poOldestJob->WaitFinished();
    5304           0 :         CPLErr l_eErr = poOldestJob->eErr;
    5305           0 :         if (l_eErr == CE_None)
    5306             :         {
    5307           0 :             l_eErr = WriteJobData(poOldestJob);
    5308             :         }
    5309             : 
    5310           0 :         jobList.pop_front();  // the list owns the job through unique_ptr, so this destroys it
    5311           0 :         return l_eErr;
    5312             :     };
    5313             : 
    5314             :     // Queue of jobs
    5315        1344 :     std::list<std::unique_ptr<OvrJob>> jobList;
    5316             : 
    5317         672 :     GByte *pabyChunkNodataMask = nullptr;
    5318         672 :     void *pChunk = nullptr;
    5319             : 
    5320         672 :     const int nThreads = GDALGetNumThreads(GDAL_DEFAULT_MAX_THREAD_COUNT,
    5321             :                                            /* bDefaultToAllCPUs=*/false);
    5322             :     auto poThreadPool =
    5323         672 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    5324             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    5325        1344 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    5326             : 
    5327             :     /* -------------------------------------------------------------------- */
    5328             :     /*      Loop over image operating on chunks.                            */
    5329             :     /* -------------------------------------------------------------------- */
    5330         672 :     int nChunkYOff = 0;
    5331         672 :     CPLErr eErr = CE_None;
    5332             : 
    5333        1349 :     for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
    5334         677 :          nChunkYOff += nFullResYChunk)
    5335             :     {
    5336         677 :         if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
    5337             :                          pProgressData))
    5338             :         {
    5339           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    5340           0 :             eErr = CE_Failure;
    5341             :         }
    5342             : 
    5343         677 :         if (nFullResYChunk + nChunkYOff > nHeight)
    5344         669 :             nFullResYChunk = nHeight - nChunkYOff;
    5345             : 
    5346         677 :         int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
    5347         677 :         int nChunkYSizeQueried =
    5348         677 :             nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
    5349         677 :         if (nChunkYOffQueried < 0)
    5350             :         {
    5351          83 :             nChunkYSizeQueried += nChunkYOffQueried;
    5352          83 :             nChunkYOffQueried = 0;
    5353             :         }
    5354         677 :         if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
    5355          83 :             nChunkYSizeQueried = nHeight - nChunkYOffQueried;
    5356             : 
    5357             :         // Avoid accumulating too many tasks and exhausting RAM
    5358             :         // Try to complete already finished jobs
    5359         677 :         while (eErr == CE_None && !jobList.empty())
    5360             :         {
    5361           0 :             auto poOldestJob = jobList.front().get();
    5362           0 :             if (!poOldestJob->IsFinished())
    5363           0 :                 break;
    5364           0 :             eErr = poOldestJob->eErr;
    5365           0 :             if (eErr == CE_None)
    5366             :             {
    5367           0 :                 eErr = WriteJobData(poOldestJob);
    5368             :             }
    5369             : 
    5370           0 :             jobList.pop_front();
    5371             :         }
    5372             : 
    5373             :         // And in case we have saturated the number of threads,
    5374             :         // wait for completion of tasks to go below the threshold.
    5375        1354 :         while (eErr == CE_None &&
    5376         677 :                jobList.size() >= static_cast<size_t>(nThreads))
    5377             :         {
    5378           0 :             eErr = WaitAndFinalizeOldestJob(jobList);
    5379             :         }
    5380             : 
    5381             :         // (Re)allocate buffers if needed
    5382         677 :         if (pChunk == nullptr)
    5383             :         {
    5384         672 :             pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    5385             :                                          nMaxChunkYSizeQueried, nWidth);
    5386             :         }
    5387         677 :         if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
    5388             :         {
    5389         139 :             pabyChunkNodataMask = static_cast<GByte *>(
    5390         139 :                 VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
    5391             :         }
    5392             : 
    5393         677 :         if (pChunk == nullptr ||
    5394         139 :             (bUseNoDataMask && pabyChunkNodataMask == nullptr))
    5395             :         {
    5396           0 :             CPLFree(pChunk);
    5397           0 :             CPLFree(pabyChunkNodataMask);
    5398           0 :             return CE_Failure;
    5399             :         }
    5400             : 
    5401             :         // Read chunk.
    5402         677 :         if (eErr == CE_None)
    5403         677 :             eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    5404             :                                        nChunkYSizeQueried, pChunk, nWidth,
    5405             :                                        nChunkYSizeQueried, eWrkDataType, 0, 0,
    5406             :                                        nullptr);
    5407         677 :         if (eErr == CE_None && bUseNoDataMask)
    5408         139 :             eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    5409             :                                         nChunkYSizeQueried, pabyChunkNodataMask,
    5410             :                                         nWidth, nChunkYSizeQueried, GDT_UInt8,
    5411             :                                         0, 0, nullptr);
    5412             : 
    5413             :         // Special case to promote 1bit data to 8bit 0/255 values.
    5414         677 :         if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
    5415             :         {
    5416           9 :             if (eWrkDataType == GDT_Float32)
    5417             :             {
    5418           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    5419           0 :                 for (size_t i = 0;
    5420           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5421             :                 {
    5422           0 :                     if (pafChunk[i] == 1.0f)
    5423           0 :                         pafChunk[i] = 255.0f;
    5424             :                 }
    5425             :             }
    5426           9 :             else if (eWrkDataType == GDT_UInt8)
    5427             :             {
    5428           9 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    5429      168417 :                 for (size_t i = 0;
    5430      168417 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5431             :                 {
    5432      168408 :                     if (pabyChunk[i] == 1)
    5433      127437 :                         pabyChunk[i] = 255;
    5434             :                 }
    5435             :             }
    5436           0 :             else if (eWrkDataType == GDT_UInt16)
    5437             :             {
    5438           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    5439           0 :                 for (size_t i = 0;
    5440           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5441             :                 {
    5442           0 :                     if (pasChunk[i] == 1)
    5443           0 :                         pasChunk[i] = 255;
    5444             :                 }
    5445             :             }
    5446           0 :             else if (eWrkDataType == GDT_Float64)
    5447             :             {
    5448           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    5449           0 :                 for (size_t i = 0;
    5450           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5451             :                 {
    5452           0 :                     if (padfChunk[i] == 1.0)
    5453           0 :                         padfChunk[i] = 255.0;
    5454             :                 }
    5455             :             }
    5456             :             else
    5457             :             {
    5458           0 :                 CPLAssert(false);
    5459             :             }
    5460             :         }
    5461         668 :         else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
    5462             :         {
    5463           0 :             if (eWrkDataType == GDT_Float32)
    5464             :             {
    5465           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    5466           0 :                 for (size_t i = 0;
    5467           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5468             :                 {
    5469           0 :                     if (pafChunk[i] == 1.0f)
    5470           0 :                         pafChunk[i] = 0.0f;
    5471           0 :                     else if (pafChunk[i] == 0.0f)
    5472           0 :                         pafChunk[i] = 255.0f;
    5473             :                 }
    5474             :             }
    5475           0 :             else if (eWrkDataType == GDT_UInt8)
    5476             :             {
    5477           0 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    5478           0 :                 for (size_t i = 0;
    5479           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5480             :                 {
    5481           0 :                     if (pabyChunk[i] == 1)
    5482           0 :                         pabyChunk[i] = 0;
    5483           0 :                     else if (pabyChunk[i] == 0)
    5484           0 :                         pabyChunk[i] = 255;
    5485             :                 }
    5486             :             }
    5487           0 :             else if (eWrkDataType == GDT_UInt16)
    5488             :             {
    5489           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    5490           0 :                 for (size_t i = 0;
    5491           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5492             :                 {
    5493           0 :                     if (pasChunk[i] == 1)
    5494           0 :                         pasChunk[i] = 0;
    5495           0 :                     else if (pasChunk[i] == 0)
    5496           0 :                         pasChunk[i] = 255;
    5497             :                 }
    5498             :             }
    5499           0 :             else if (eWrkDataType == GDT_Float64)
    5500             :             {
    5501           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    5502           0 :                 for (size_t i = 0;
    5503           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5504             :                 {
    5505           0 :                     if (padfChunk[i] == 1.0)
    5506           0 :                         padfChunk[i] = 0.0;
    5507           0 :                     else if (padfChunk[i] == 0.0)
    5508           0 :                         padfChunk[i] = 255.0;
    5509             :                 }
    5510             :             }
    5511             :             else
    5512             :             {
    5513           0 :                 CPLAssert(false);
    5514             :             }
    5515             :         }
    5516             : 
    5517         677 :         auto pChunkRaw = pChunk;
    5518         677 :         auto pabyChunkNodataMaskRaw = pabyChunkNodataMask;
    5519         677 :         std::shared_ptr<PointerHolder> oSrcBufferHolder;
    5520         677 :         std::shared_ptr<PointerHolder> oSrcMaskBufferHolder;
    5521         677 :         if (poJobQueue)
    5522             :         {
    5523           0 :             oSrcBufferHolder = std::make_shared<PointerHolder>(pChunk);
    5524             :             oSrcMaskBufferHolder =
    5525           0 :                 std::make_shared<PointerHolder>(pabyChunkNodataMask);
    5526             :         }
    5527             : 
    5528        1451 :         for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
    5529             :              ++iOverview)
    5530             :         {
    5531         774 :             GDALRasterBand *poDstBand = papoOvrBands[iOverview];
    5532         774 :             const int nDstWidth = poDstBand->GetXSize();
    5533         774 :             const int nDstHeight = poDstBand->GetYSize();
    5534             : 
    5535         774 :             const double dfXRatioDstToSrc =
    5536         774 :                 static_cast<double>(nWidth) / nDstWidth;
    5537         774 :             const double dfYRatioDstToSrc =
    5538         774 :                 static_cast<double>(nHeight) / nDstHeight;
    5539             : 
    5540             :             /* --------------------------------------------------------------------
    5541             :              */
    5542             :             /*      Figure out the line to start writing to, and the first line
    5543             :              */
    5544             :             /*      to not write to.  In theory this approach should ensure that
    5545             :              */
    5546             :             /*      every output line will be written if all input chunks are */
    5547             :             /*      processed. */
    5548             :             /* --------------------------------------------------------------------
    5549             :              */
    5550         774 :             int nDstYOff =
    5551         774 :                 static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
    5552         774 :             if (nDstYOff == nDstHeight)
    5553           0 :                 continue;
    5554         774 :             int nDstYOff2 = static_cast<int>(
    5555         774 :                 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
    5556             : 
    5557         774 :             if (nChunkYOff + nFullResYChunk == nHeight)
    5558         767 :                 nDstYOff2 = nDstHeight;
    5559             : #if DEBUG_VERBOSE
    5560             :             CPLDebug("GDAL",
    5561             :                      "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
    5562             :                      nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
    5563             :                      nDstWidth, nDstYOff2 - nDstYOff);
    5564             : #endif
    5565             : 
    5566        1548 :             auto poJob = std::make_unique<OvrJob>();
    5567         774 :             poJob->pfnResampleFn = pfnResampleFn;
    5568         774 :             poJob->bUseGenericResampleFn = bUseGenericResampleFn;
    5569         774 :             poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
    5570         774 :             poJob->args.nOvrXSize = poDstBand->GetXSize();
    5571         774 :             poJob->args.nOvrYSize = poDstBand->GetYSize();
    5572             :             const char *pszNBITS =
    5573         774 :                 poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    5574         774 :             poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    5575         774 :             poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    5576         774 :             poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    5577         774 :             poJob->args.eWrkDataType = eWrkDataType;
    5578         774 :             poJob->pChunk = pChunkRaw;
    5579         774 :             poJob->args.pabyChunkNodataMask = pabyChunkNodataMaskRaw;
    5580         774 :             poJob->nSrcWidth = nWidth;
    5581         774 :             poJob->nSrcHeight = nHeight;
    5582         774 :             poJob->args.nChunkXOff = 0;
    5583         774 :             poJob->args.nChunkXSize = nWidth;
    5584         774 :             poJob->args.nChunkYOff = nChunkYOffQueried;
    5585         774 :             poJob->args.nChunkYSize = nChunkYSizeQueried;
    5586         774 :             poJob->nDstWidth = nDstWidth;
    5587         774 :             poJob->args.nDstXOff = 0;
    5588         774 :             poJob->args.nDstXOff2 = nDstWidth;
    5589         774 :             poJob->args.nDstYOff = nDstYOff;
    5590         774 :             poJob->args.nDstYOff2 = nDstYOff2;
    5591         774 :             poJob->poDstBand = poDstBand;
    5592         774 :             poJob->args.pszResampling = pszResampling;
    5593         774 :             poJob->args.bHasNoData = bHasNoData;
    5594         774 :             poJob->args.dfNoDataValue = dfNoDataValue;
    5595         774 :             poJob->args.poColorTable = poColorTable;
    5596         774 :             poJob->args.eSrcDataType = eSrcDataType;
    5597         774 :             poJob->args.bPropagateNoData = bPropagateNoData;
    5598             : 
    5599         774 :             if (poJobQueue)
    5600             :             {
    5601           0 :                 poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
    5602           0 :                 poJob->SetSrcBufferHolder(oSrcBufferHolder);
    5603           0 :                 poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    5604           0 :                 jobList.emplace_back(std::move(poJob));
    5605             :             }
    5606             :             else
    5607             :             {
    5608         774 :                 JobResampleFunc(poJob.get());
    5609         774 :                 eErr = poJob->eErr;
    5610         774 :                 if (eErr == CE_None)
    5611             :                 {
    5612         774 :                     eErr = WriteJobData(poJob.get());
    5613             :                 }
    5614             :             }
    5615             :         }
    5616             :     }
    5617             : 
    5618         672 :     VSIFree(pChunk);
    5619         672 :     VSIFree(pabyChunkNodataMask);
    5620             : 
    5621             :     // Wait for all pending jobs to complete
    5622         672 :     while (!jobList.empty())
    5623             :     {
    5624           0 :         const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    5625           0 :         if (l_eErr != CE_None && eErr == CE_None)
    5626           0 :             eErr = l_eErr;
    5627             :     }
    5628             : 
    5629             :     /* -------------------------------------------------------------------- */
    5630             :     /*      Renormalized overview mean / stddev if needed.                  */
    5631             :     /* -------------------------------------------------------------------- */
    5632         672 :     if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
    5633             :     {
    5634           0 :         GDALOverviewMagnitudeCorrection(
    5635             :             poSrcBand, nOverviewCount,
    5636             :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
    5637             :             GDALDummyProgress, nullptr);
    5638             :     }
    5639             : 
    5640             :     /* -------------------------------------------------------------------- */
    5641             :     /*      It can be important to flush out data to overviews.             */
    5642             :     /* -------------------------------------------------------------------- */
    5643        1439 :     for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
    5644             :          ++iOverview)
    5645             :     {
    5646         767 :         eErr = papoOvrBands[iOverview]->FlushCache(false);
    5647             :     }
    5648             : 
    5649         672 :     if (eErr == CE_None)
    5650         672 :         pfnProgress(1.0, nullptr, pProgressData);
    5651             : 
    5652         672 :     return eErr;
    5653             : }
    5654             : 
    5655             : /************************************************************************/
    5656             : /*                  GDALRegenerateOverviewsMultiBand()                  */
    5657             : /************************************************************************/
    5658             : 
    5659             : /**
    5660             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    5661             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
    5662             :  *
    5663             :  * This function will generate one or more overview images from a base
    5664             :  * image using the requested downsampling algorithm.  Its primary use
    5665             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    5666             :  * can also be used to generate downsampled images in one file from another
    5667             :  * outside the overview architecture.
    5668             :  *
    5669             :  * The output bands need to exist in advance and share the same characteristics
    5670             :  * (type, dimensions)
    5671             :  *
    5672             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    5673             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
    5674             :  *
    5675             :  * It does not support color tables or complex data types.
    5676             :  *
    5677             :  * The pseudo-algorithm used by the function is :
    5678             :  *    for each overview
    5679             :  *       iterate on lines of the source by a step of deltay
    5680             :  *           iterate on columns of the source  by a step of deltax
    5681             :  *               read the source data of size deltax * deltay for all the bands
    5682             :  *               generate the corresponding overview block for all the bands
    5683             :  *
    5684             :  * This function will properly honour NODATA_VALUES tuples (special dataset
    5685             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    5686             :  * considered as the nodata value and not each value of the triplet
    5687             :  * independently per band.
    5688             :  *
    5689             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    5690             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    5691             :  * overview computation.
    5692             :  *
    5693             :  * @param nBands the number of bands, size of papoSrcBands and size of
    5694             :  *               first dimension of papapoOverviewBands
    5695             :  * @param papoSrcBands the list of source bands to downsample
    5696             :  * @param nOverviews the number of downsampled overview levels being generated.
    5697             :  * @param papapoOverviewBands bidimensional array of bands. First dimension is
    5698             :  *                            indexed by nBands. Second dimension is indexed by
    5699             :  *                            nOverviews.
    5700             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    5701             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
    5702             :  * @param pfnProgress progress report function.
    5703             :  * @param pProgressData progress function callback data.
    5704             :  * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
    5705             :  *                     key=value pairs, or NULL
    5706             :  *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
    5707             :  *                     options can be specified to express that overviews should
    5708             :  *                     be regenerated only in the specified subset of the source
    5709             :  *                     dataset.
    5710             :  * @return CE_None on success or CE_Failure on failure.
    5711             :  */
    5712             : 
    5713         387 : CPLErr GDALRegenerateOverviewsMultiBand(
    5714             :     int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
    5715             :     GDALRasterBand *const *const *papapoOverviewBands,
    5716             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    5717             :     void *pProgressData, CSLConstList papszOptions)
    5718             : {
    5719         387 :     CPL_IGNORE_RET_VAL(papszOptions);
    5720             : 
    5721         387 :     if (pfnProgress == nullptr)
    5722          11 :         pfnProgress = GDALDummyProgress;
    5723             : 
    5724         387 :     if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
    5725           3 :         return CE_None;
    5726             : 
    5727             :     // Sanity checks.
    5728         384 :     if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
    5729         189 :         !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
    5730          80 :         !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
    5731          22 :         !EQUAL(pszResampling, "CUBICSPLINE") &&
    5732          21 :         !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
    5733           5 :         !EQUAL(pszResampling, "MODE"))
    5734             :     {
    5735           0 :         CPLError(CE_Failure, CPLE_NotSupported,
    5736             :                  "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
    5737             :                  "not supported",
    5738             :                  pszResampling);
    5739           0 :         return CE_Failure;
    5740             :     }
    5741             : 
    5742         384 :     int nKernelRadius = 0;
    5743             :     GDALResampleFunction pfnResampleFn =
    5744         384 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    5745         384 :     if (pfnResampleFn == nullptr)
    5746           0 :         return CE_Failure;
    5747             : 
    5748         384 :     const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
    5749         384 :     const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
    5750         384 :     if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
    5751           0 :         return CE_None;
    5752         384 :     GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
    5753       66225 :     for (int iBand = 1; iBand < nBands; ++iBand)
    5754             :     {
    5755      131682 :         if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
    5756       65841 :             papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
    5757             :         {
    5758           0 :             CPLError(
    5759             :                 CE_Failure, CPLE_NotSupported,
    5760             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5761             :                 "have the same dimensions");
    5762           0 :             return CE_Failure;
    5763             :         }
    5764       65841 :         if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
    5765             :         {
    5766           0 :             CPLError(
    5767             :                 CE_Failure, CPLE_NotSupported,
    5768             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5769             :                 "have the same data type");
    5770           0 :             return CE_Failure;
    5771             :         }
    5772             :     }
    5773             : 
    5774        1024 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5775             :     {
    5776         640 :         const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
    5777         640 :         const int nDstWidth = poOvrFirstBand->GetXSize();
    5778         640 :         const int nDstHeight = poOvrFirstBand->GetYSize();
    5779       66739 :         for (int iBand = 1; iBand < nBands; ++iBand)
    5780             :         {
    5781       66099 :             const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
    5782      132198 :             if (poOvrBand->GetXSize() != nDstWidth ||
    5783       66099 :                 poOvrBand->GetYSize() != nDstHeight)
    5784             :             {
    5785           0 :                 CPLError(
    5786             :                     CE_Failure, CPLE_NotSupported,
    5787             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5788             :                     "of the same level must have the same dimensions");
    5789           0 :                 return CE_Failure;
    5790             :             }
    5791       66099 :             if (poOvrBand->GetRasterDataType() != eDataType)
    5792             :             {
    5793           0 :                 CPLError(
    5794             :                     CE_Failure, CPLE_NotSupported,
    5795             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5796             :                     "must have the same data type as the source bands");
    5797           0 :                 return CE_Failure;
    5798             :             }
    5799             :         }
    5800             :     }
    5801             : 
    5802             :     // First pass to compute the total number of pixels to write.
    5803         384 :     double dfTotalPixelCount = 0;
    5804         384 :     const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
    5805         384 :     const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
    5806         384 :     const int nSrcXSize = atoi(CSLFetchNameValueDef(
    5807             :         papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
    5808         384 :     const int nSrcYSize = atoi(CSLFetchNameValueDef(
    5809             :         papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
    5810        1024 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5811             :     {
    5812         640 :         dfTotalPixelCount +=
    5813        1280 :             static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
    5814         640 :             papapoOverviewBands[0][iOverview]->GetXSize() *
    5815        1280 :             static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
    5816         640 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5817             :     }
    5818             : 
    5819             :     const GDALDataType eWrkDataType =
    5820         384 :         GDALGetOvrWorkDataType(pszResampling, eDataType);
    5821             :     const int nWrkDataTypeSize =
    5822         384 :         std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
    5823             : 
    5824         384 :     const bool bIsMask = papoSrcBands[0]->IsMaskBand();
    5825             : 
    5826             :     // If we have a nodata mask and we are doing something more complicated
    5827             :     // than nearest neighbouring, we have to fetch the nodata mask.
    5828             :     const bool bUseNoDataMask =
    5829         567 :         !STARTS_WITH_CI(pszResampling, "NEAR") &&
    5830         183 :         (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
    5831             : 
    5832         768 :     std::vector<bool> abHasNoData(nBands);
    5833         768 :     std::vector<double> adfNoDataValue(nBands);
    5834             : 
    5835       66609 :     for (int iBand = 0; iBand < nBands; ++iBand)
    5836             :     {
    5837       66225 :         int nHasNoData = 0;
    5838      132450 :         adfNoDataValue[iBand] =
    5839       66225 :             papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
    5840       66225 :         abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
    5841             :     }
    5842             : 
    5843         768 :     std::string osDetailMessage;
    5844         436 :     if (bUseNoDataMask &&
    5845          52 :         papoSrcBands[0]->HasConflictingMaskSources(&osDetailMessage, false))
    5846             :     {
    5847           9 :         CPLError(CE_Warning, CPLE_AppDefined, "%s%s", osDetailMessage.c_str(),
    5848          18 :                  abHasNoData[0]
    5849             :                      ? "Only the nodata value will be taken into account."
    5850           9 :                      : "Only the first listed one will be taken into account.");
    5851             :     }
    5852             : 
    5853             :     const bool bPropagateNoData =
    5854         384 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    5855             : 
    5856         384 :     const int nThreads = GDALGetNumThreads(GDAL_DEFAULT_MAX_THREAD_COUNT,
    5857             :                                            /* bDefaultToAllCPUs=*/false);
    5858             :     auto poThreadPool =
    5859         384 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    5860             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    5861         768 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    5862             : 
    5863             :     // Only configurable for debug / testing
    5864         384 :     const GIntBig nChunkMaxSize = []() -> GIntBig
    5865             :     {
    5866             :         const char *pszVal =
    5867         384 :             CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
    5868         384 :         if (pszVal)
    5869             :         {
    5870          15 :             GIntBig nRet = 0;
    5871          15 :             CPLParseMemorySize(pszVal, &nRet, nullptr);
    5872          15 :             return std::max<GIntBig>(100, nRet);
    5873             :         }
    5874         369 :         return 10 * 1024 * 1024;
    5875         384 :     }();
    5876             : 
    5877             :     // Only configurable for debug / testing
    5878         384 :     const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
    5879             :     {
    5880         384 :         const char *pszVal = CPLGetConfigOption(
    5881             :             "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
    5882         384 :         if (pszVal)
    5883             :         {
    5884          14 :             GIntBig nRet = 0;
    5885          14 :             CPLParseMemorySize(pszVal, &nRet, nullptr);
    5886          14 :             return std::max<GIntBig>(100, nRet);
    5887             :         }
    5888         370 :         const auto nUsableRAM = CPLGetUsablePhysicalRAM();
    5889         370 :         if (nUsableRAM > 0)
    5890         370 :             return nUsableRAM / 10;
    5891             :         // Select a value to be able to at least downsample by 2 for a RGB
    5892             :         // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
    5893           0 :         return 100 * 1024 * 1024;
    5894         384 :     }();
    5895             : 
    5896             :     // Second pass to do the real job.
    5897         384 :     double dfCurPixelCount = 0;
    5898         384 :     CPLErr eErr = CE_None;
    5899        1018 :     for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
    5900             :          ++iOverview)
    5901             :     {
    5902         639 :         int iSrcOverview = -1;  // -1 means the source bands.
    5903             : 
    5904             :         const int nDstTotalWidth =
    5905         639 :             papapoOverviewBands[0][iOverview]->GetXSize();
    5906             :         const int nDstTotalHeight =
    5907         639 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5908             : 
    5909             :         // Compute the coordinates of the target region to refresh
    5910         639 :         constexpr double EPS = 1e-8;
    5911         639 :         const int nDstXOffStart = static_cast<int>(
    5912         639 :             static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
    5913             :             EPS);
    5914             :         const int nDstXOffEnd =
    5915        1278 :             std::min(static_cast<int>(
    5916         639 :                          std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
    5917         639 :                                        nToplevelSrcWidth * nDstTotalWidth -
    5918             :                                    EPS)),
    5919         639 :                      nDstTotalWidth);
    5920         639 :         const int nDstWidth = nDstXOffEnd - nDstXOffStart;
    5921         639 :         const int nDstYOffStart =
    5922         639 :             static_cast<int>(static_cast<double>(nSrcYOff) /
    5923         639 :                                  nToplevelSrcHeight * nDstTotalHeight +
    5924             :                              EPS);
    5925             :         const int nDstYOffEnd =
    5926        1278 :             std::min(static_cast<int>(
    5927         639 :                          std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
    5928         639 :                                        nToplevelSrcHeight * nDstTotalHeight -
    5929             :                                    EPS)),
    5930         639 :                      nDstTotalHeight);
    5931         639 :         const int nDstHeight = nDstYOffEnd - nDstYOffStart;
    5932             : 
    5933             :         // Try to use previous level of overview as the source to compute
    5934             :         // the next level.
    5935         639 :         int nSrcWidth = nToplevelSrcWidth;
    5936         639 :         int nSrcHeight = nToplevelSrcHeight;
    5937         894 :         if (iOverview > 0 &&
    5938         255 :             papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
    5939             :         {
    5940         247 :             nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
    5941         247 :             nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
    5942         247 :             iSrcOverview = iOverview - 1;
    5943             :         }
    5944             : 
    5945         639 :         const double dfXRatioDstToSrc =
    5946         639 :             static_cast<double>(nSrcWidth) / nDstTotalWidth;
    5947         639 :         const double dfYRatioDstToSrc =
    5948         639 :             static_cast<double>(nSrcHeight) / nDstTotalHeight;
    5949             : 
    5950             :         const int nOvrFactor =
    5951        1917 :             std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    5952         639 :                                  static_cast<int>(0.5 + dfYRatioDstToSrc)));
    5953             : 
    5954         639 :         int nDstChunkXSize = 0;
    5955         639 :         int nDstChunkYSize = 0;
    5956         639 :         papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
    5957             :                                                         &nDstChunkYSize);
    5958             : 
    5959         639 :         constexpr int PIXEL_MARGIN = 2;
    5960             :         // Try to extend the chunk size so that the memory needed to acquire
    5961             :         // source pixels goes up to 10 MB.
    5962             :         // This can help for drivers that support multi-threaded reading
    5963         639 :         const int nFullResYChunk = static_cast<int>(std::min<double>(
    5964         639 :             nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
    5965         639 :         const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
    5966        1278 :             nSrcHeight,
    5967        1278 :             nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    5968         639 :                                  nKernelRadius * nOvrFactor));
    5969         872 :         while (nDstChunkXSize < nDstWidth)
    5970             :         {
    5971         252 :             constexpr int INCREASE_FACTOR = 2;
    5972             : 
    5973         252 :             const int nFullResXChunk = static_cast<int>(std::min<double>(
    5974         504 :                 nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
    5975         252 :                                               dfXRatioDstToSrc));
    5976             : 
    5977             :             const int nFullResXChunkQueried =
    5978         252 :                 static_cast<int>(std::min<int64_t>(
    5979         504 :                     nSrcWidth,
    5980         504 :                     nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    5981         252 :                                          nKernelRadius * nOvrFactor));
    5982             : 
    5983         252 :             if (nBands > nChunkMaxSize / nFullResXChunkQueried /
    5984         252 :                              nFullResYChunkQueried / nWrkDataTypeSize)
    5985             :             {
    5986          19 :                 break;
    5987             :             }
    5988             : 
    5989         233 :             nDstChunkXSize *= INCREASE_FACTOR;
    5990             :         }
    5991         639 :         nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
    5992             : 
    5993         639 :         const int nFullResXChunk = static_cast<int>(std::min<double>(
    5994         639 :             nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
    5995         639 :         const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
    5996        1278 :             nSrcWidth,
    5997        1278 :             nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    5998         639 :                                  nKernelRadius * nOvrFactor));
    5999             : 
    6000             :         // Make sure that the RAM requirements to acquire the source data does
    6001             :         // not exceed nChunkMaxSizeForTempFile
    6002             :         // If so, reduce the destination chunk size, generate overviews in a
    6003             :         // temporary dataset, and copy that temporary dataset over the target
    6004             :         // overview bands (to avoid issues with lossy compression)
    6005             :         const bool bOverflowFullResXChunkYChunkQueried =
    6006         639 :             nBands > std::numeric_limits<int64_t>::max() /
    6007         639 :                          nFullResXChunkQueried / nFullResYChunkQueried /
    6008         639 :                          nWrkDataTypeSize;
    6009             : 
    6010         639 :         const auto nMemRequirement =
    6011             :             bOverflowFullResXChunkYChunkQueried
    6012         639 :                 ? 0
    6013         635 :                 : static_cast<GIntBig>(nFullResXChunkQueried) *
    6014         635 :                       nFullResYChunkQueried * nBands * nWrkDataTypeSize;
    6015             :         // Use a temporary dataset with a smaller destination chunk size
    6016         639 :         const auto nOverShootFactor =
    6017             :             nMemRequirement / nChunkMaxSizeForTempFile;
    6018             : 
    6019         639 :         constexpr int MIN_OVERSHOOT_FACTOR = 4;
    6020             :         const auto nSqrtOverShootFactor = std::max<GIntBig>(
    6021        1278 :             MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
    6022         639 :                                       static_cast<double>(nOverShootFactor)))));
    6023         639 :         constexpr int DEFAULT_CHUNK_SIZE = 256;
    6024         639 :         constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
    6025             :         const int nReducedDstChunkXSize =
    6026             :             bOverflowFullResXChunkYChunkQueried
    6027        1274 :                 ? DEFAULT_CHUNK_SIZE
    6028        1274 :                 : std::max(1, static_cast<int>(nDstChunkXSize /
    6029        1274 :                                                nSqrtOverShootFactor) &
    6030         635 :                                   ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
    6031             :         const int nReducedDstChunkYSize =
    6032             :             bOverflowFullResXChunkYChunkQueried
    6033        1274 :                 ? DEFAULT_CHUNK_SIZE
    6034        1274 :                 : std::max(1, static_cast<int>(nDstChunkYSize /
    6035        1274 :                                                nSqrtOverShootFactor) &
    6036         635 :                                   ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
    6037             : 
    6038         639 :         if (bOverflowFullResXChunkYChunkQueried ||
    6039             :             nMemRequirement > nChunkMaxSizeForTempFile)
    6040             :         {
    6041             :             const auto nDTSize =
    6042          43 :                 std::max(1, GDALGetDataTypeSizeBytes(eDataType));
    6043             :             const bool bTmpDSMemRequirementOverflow =
    6044          43 :                 nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
    6045          43 :                              nDstHeight / nDTSize;
    6046          43 :             const auto nTmpDSMemRequirement =
    6047             :                 bTmpDSMemRequirementOverflow
    6048          43 :                     ? 0
    6049          41 :                     : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
    6050          41 :                           nDTSize;
    6051             : 
    6052             :             // make sure that one band buffer doesn't overflow size_t
    6053             :             const bool bChunkSizeOverflow =
    6054          43 :                 static_cast<size_t>(nDTSize) >
    6055          43 :                 std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
    6056          43 :             const size_t nChunkSize =
    6057             :                 bChunkSizeOverflow
    6058          43 :                     ? 0
    6059          41 :                     : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
    6060             : 
    6061             :             const auto CreateVRT =
    6062          41 :                 [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
    6063             :                  pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
    6064             :                  iSrcOverview, &abHasNoData,
    6065      393585 :                  &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
    6066             :             {
                                       // Builds a VRT dataset of nDstTotalWidth x nDstTotalHeight with
                                       // one band of type eWrkDataType per input band; each band wraps
                                       // its source band through a VRTSimpleSource using pszResampling.
                                       // nVRTBlockXSize / nVRTBlockYSize are forwarded to the VRTDataset
                                       // constructor (presumably as its block size -- confirm against
                                       // the VRTDataset declaration). abHasNoData and adfNoDataValue are
                                       // captured by reference, everything else by value.
                                       // Returns the dataset as a std::unique_ptr<VRTDataset>.
    6067             :                 auto poVRTDS = std::make_unique<VRTDataset>(
    6068          41 :                     nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
    6069          41 :                     nVRTBlockYSize);
    6070             : 
    6071       65620 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    6072             :                 {
    6073      131158 :                     auto poVRTSrc = std::make_unique<VRTSimpleSource>();
    6074       65579 :                     poVRTSrc->SetResampling(pszResampling);
    6075       65579 :                     poVRTDS->AddBand(eWrkDataType);
    6076             :                     auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
    6077       65579 :                         poVRTDS->GetRasterBand(iBand + 1));
    6078             : 
                                           // Read from the selected source overview when
                                           // iSrcOverview != -1, otherwise from the input band itself
    6079       65579 :                     auto poSrcBand = papoSrcBands[iBand];
    6080       65579 :                     if (iSrcOverview != -1)
    6081          24 :                         poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
                                           // Map the whole nSrcWidth x nSrcHeight source window onto
                                           // the whole nDstTotalWidth x nDstTotalHeight VRT extent
    6082       65579 :                     poVRTBand->ConfigureSource(
    6083             :                         poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
    6084             :                         nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
    6085             :                     // Add the source to the band
                                           // (release() gives up the unique_ptr's ownership; the band
                                           // presumably takes it over -- confirm in AddSource)
    6086       65579 :                     poVRTBand->AddSource(poVRTSrc.release());
    6087       65579 :                     if (abHasNoData[iBand])
    6088           3 :                         poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
    6089             :                 }
    6090             : 
                                       // When the first input band reports GMF_PER_DATASET mask flags
                                       // and a per-dataset mask band could be created on the VRT, feed
                                       // that mask band from the mask of the first source band (or of
                                       // its selected overview), with the same source/target windows
    6091          42 :                 if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
    6092           1 :                     poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
    6093             :                 {
    6094             :                     VRTSourcedRasterBand *poMaskVRTBand =
    6095           1 :                         cpl::down_cast<VRTSourcedRasterBand *>(
    6096           1 :                             poVRTDS->GetRasterBand(1)->GetMaskBand());
    6097           1 :                     auto poSrcBand = papoSrcBands[0];
    6098           1 :                     if (iSrcOverview != -1)
    6099           0 :                         poSrcBand = papapoOverviewBands[0][iSrcOverview];
    6100           1 :                     poMaskVRTBand->AddMaskBandSource(
    6101           1 :                         poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
    6102             :                         0, 0, nDstTotalWidth, nDstTotalHeight);
    6103             :                 }
    6104             : 
    6105          41 :                 return poVRTDS;
    6106          43 :             };
    6107             : 
    6108             :             // If the overview accommodates chunking, do so and recurse
    6109             :             // to avoid generating full size temporary files
    6110          43 :             if (!bOverflowFullResXChunkYChunkQueried &&
    6111          39 :                 !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
    6112          39 :                 (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
    6113             :             {
    6114             :                 // Create a VRT with the smaller chunk to do the scaling
    6115             :                 auto poVRTDS =
    6116          13 :                     CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
    6117             : 
    6118          13 :                 std::vector<GDALRasterBand *> apoVRTBand(nBands);
    6119          13 :                 std::vector<GDALRasterBand *> apoDstBand(nBands);
    6120       65560 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    6121             :                 {
    6122       65547 :                     apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
    6123       65547 :                     apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
    6124             :                 }
    6125             : 
    6126             :                 // Use a flag to avoid reading from the overview being built
    6127             :                 GDALRasterIOExtraArg sExtraArg;
    6128          13 :                 INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    6129          13 :                 if (iSrcOverview == -1)
    6130          13 :                     sExtraArg.bUseOnlyThisScale = true;
    6131             : 
    6132             :                 // A single band buffer for data transfer to the overview
    6133          13 :                 std::vector<GByte> abyChunk;
    6134             :                 try
    6135             :                 {
    6136          13 :                     abyChunk.resize(nChunkSize);
    6137             :                 }
    6138           0 :                 catch (const std::exception &)
    6139             :                 {
    6140           0 :                     CPLError(CE_Failure, CPLE_OutOfMemory,
    6141             :                              "Out of memory allocating temporary buffer");
    6142           0 :                     return CE_Failure;
    6143             :                 }
    6144             : 
    6145             :                 // Loop over output height, in chunks
    6146          13 :                 for (int nDstYOff = nDstYOffStart;
    6147          38 :                      nDstYOff < nDstYOffEnd && eErr == CE_None;
    6148             :                      /* */)
    6149             :                 {
    6150             :                     const int nDstYCount =
    6151          25 :                         std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
    6152             :                     // Loop over output width, in output chunks
    6153          25 :                     for (int nDstXOff = nDstXOffStart;
    6154          74 :                          nDstXOff < nDstXOffEnd && eErr == CE_None;
    6155             :                          /* */)
    6156             :                     {
    6157             :                         const int nDstXCount =
    6158          49 :                             std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
    6159             :                         // Read and transfer the chunk to the overview
    6160          98 :                         for (int iBand = 0; iBand < nBands && eErr == CE_None;
    6161             :                              ++iBand)
    6162             :                         {
    6163          98 :                             eErr = apoVRTBand[iBand]->RasterIO(
    6164             :                                 GF_Read, nDstXOff, nDstYOff, nDstXCount,
    6165          49 :                                 nDstYCount, abyChunk.data(), nDstXCount,
    6166             :                                 nDstYCount, eDataType, 0, 0, &sExtraArg);
    6167          49 :                             if (eErr == CE_None)
    6168             :                             {
    6169          96 :                                 eErr = apoDstBand[iBand]->RasterIO(
    6170             :                                     GF_Write, nDstXOff, nDstYOff, nDstXCount,
    6171          48 :                                     nDstYCount, abyChunk.data(), nDstXCount,
    6172             :                                     nDstYCount, eDataType, 0, 0, nullptr);
    6173             :                             }
    6174             :                         }
    6175             : 
    6176          49 :                         dfCurPixelCount +=
    6177          49 :                             static_cast<double>(nDstXCount) * nDstYCount;
    6178             : 
    6179          49 :                         nDstXOff += nDstXCount;
    6180             :                     }  // width
    6181             : 
    6182          25 :                     if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
    6183             :                                      nullptr, pProgressData))
    6184             :                     {
    6185           0 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    6186             :                                  "User terminated");
    6187           0 :                         eErr = CE_Failure;
    6188             :                     }
    6189             : 
    6190          25 :                     nDstYOff += nDstYCount;
    6191             :                 }  // height
    6192             : 
    6193          13 :                 if (CE_None != eErr)
    6194             :                 {
    6195           1 :                     CPLError(CE_Failure, CPLE_AppDefined,
    6196             :                              "Error while writing overview");
    6197           1 :                     return CE_Failure;
    6198             :                 }
    6199             : 
    6200          12 :                 pfnProgress(1.0, nullptr, pProgressData);
    6201             :                 // Flush the overviews we just generated
    6202          24 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    6203          12 :                     apoDstBand[iBand]->FlushCache(false);
    6204             : 
    6205          12 :                 continue;  // Next overview
    6206             :             }  // chunking via temporary dataset
    6207             : 
    6208           0 :             std::unique_ptr<GDALDataset> poTmpDS;
    6209             :             // Config option mostly/only for autotest purposes
    6210             :             const char *pszGDAL_OVR_TEMP_DRIVER =
    6211          30 :                 CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
    6212          30 :             if ((!bTmpDSMemRequirementOverflow &&
    6213           4 :                  nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
    6214           4 :                  !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
    6215          26 :                 EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
    6216             :             {
    6217          10 :                 auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
    6218          10 :                 if (!poTmpDrv)
    6219             :                 {
    6220           0 :                     eErr = CE_Failure;
    6221           0 :                     break;
    6222             :                 }
    6223          10 :                 poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
    6224             :                                                nDstTotalHeight, nBands,
    6225          10 :                                                eDataType, nullptr));
    6226             :             }
    6227             :             else
    6228             :             {
    6229             :                 // Create a temporary file for the overview
    6230             :                 auto poTmpDrv =
    6231          20 :                     GetGDALDriverManager()->GetDriverByName("GTiff");
    6232          20 :                 if (!poTmpDrv)
    6233             :                 {
    6234           0 :                     eErr = CE_Failure;
    6235           0 :                     break;
    6236             :                 }
    6237          40 :                 std::string osTmpFilename;
    6238          20 :                 auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
    6239          20 :                 if (poDstDS)
    6240             :                 {
    6241          20 :                     osTmpFilename = poDstDS->GetDescription();
    6242             :                     VSIStatBufL sStatBuf;
    6243          20 :                     if (!osTmpFilename.empty() &&
    6244           0 :                         VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
    6245           0 :                         osTmpFilename += "_tmp_ovr.tif";
    6246             :                 }
    6247          20 :                 if (osTmpFilename.empty())
    6248             :                 {
    6249          20 :                     osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
    6250          20 :                     osTmpFilename += ".tif";
    6251             :                 }
    6252          20 :                 CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
    6253             :                          osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
    6254          40 :                 CPLStringList aosCO;
    6255          20 :                 if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
    6256          20 :                           (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
    6257             :                 {
    6258          14 :                     aosCO.SetNameValue("TILED", "YES");
    6259             :                     aosCO.SetNameValue("BLOCKXSIZE",
    6260          14 :                                        CPLSPrintf("%d", nReducedDstChunkXSize));
    6261             :                     aosCO.SetNameValue("BLOCKYSIZE",
    6262          14 :                                        CPLSPrintf("%d", nReducedDstChunkYSize));
    6263             :                 }
    6264          20 :                 if (const char *pszCOList =
    6265          20 :                         poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
    6266             :                 {
    6267             :                     aosCO.SetNameValue(
    6268          20 :                         "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
    6269             :                 }
    6270          20 :                 poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
    6271             :                                                nDstHeight, nBands, eDataType,
    6272          20 :                                                aosCO.List()));
    6273          20 :                 if (poTmpDS)
    6274             :                 {
    6275          18 :                     poTmpDS->MarkSuppressOnClose();
    6276          18 :                     VSIUnlink(osTmpFilename.c_str());
    6277             :                 }
    6278             :             }
    6279          30 :             if (!poTmpDS)
    6280             :             {
    6281           2 :                 eErr = CE_Failure;
    6282           2 :                 break;
    6283             :             }
    6284             : 
    6285             :             // Create a full size VRT to do the resampling without edge effects
    6286             :             auto poVRTDS =
    6287          28 :                 CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
    6288             : 
    6289             :             // Allocate a band buffer with the overview chunk size
    6290             :             std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
    6291             :                 VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
    6292          28 :                                     nDstChunkYSize));
    6293          28 :             if (pDstBuffer == nullptr)
    6294             :             {
    6295           0 :                 eErr = CE_Failure;
    6296           0 :                 break;
    6297             :             }
    6298             : 
    6299             :             // Use a flag to avoid reading the overview being built
    6300             :             GDALRasterIOExtraArg sExtraArg;
    6301          28 :             INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    6302          28 :             if (iSrcOverview == -1)
    6303           4 :                 sExtraArg.bUseOnlyThisScale = true;
    6304             : 
    6305             :             // Scale and copy data from the VRT to the temp file
    6306          28 :             for (int nDstYOff = nDstYOffStart;
    6307         914 :                  nDstYOff < nDstYOffEnd && eErr == CE_None;
    6308             :                  /* */)
    6309             :             {
    6310             :                 const int nDstYCount =
    6311         886 :                     std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
    6312         886 :                 for (int nDstXOff = nDstXOffStart;
    6313      201218 :                      nDstXOff < nDstXOffEnd && eErr == CE_None;
    6314             :                      /* */)
    6315             :                 {
    6316             :                     const int nDstXCount =
    6317      200332 :                         std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
    6318      400668 :                     for (int iBand = 0; iBand < nBands && eErr == CE_None;
    6319             :                          ++iBand)
    6320             :                     {
    6321      200336 :                         auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
    6322      200336 :                         eErr = poSrcBand->RasterIO(
    6323             :                             GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
    6324             :                             pDstBuffer.get(), nDstXCount, nDstYCount,
    6325             :                             eWrkDataType, 0, 0, &sExtraArg);
    6326      200336 :                         if (eErr == CE_None)
    6327             :                         {
    6328             :                             // Write to the temporary dataset, shifted
    6329      200334 :                             auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
    6330      200334 :                             eErr = poOvrBand->RasterIO(
    6331             :                                 GF_Write, nDstXOff - nDstXOffStart,
    6332             :                                 nDstYOff - nDstYOffStart, nDstXCount,
    6333             :                                 nDstYCount, pDstBuffer.get(), nDstXCount,
    6334             :                                 nDstYCount, eWrkDataType, 0, 0, nullptr);
    6335             :                         }
    6336             :                     }
    6337      200332 :                     nDstXOff += nDstXCount;
    6338             :                 }
    6339         886 :                 nDstYOff += nDstYCount;
    6340             :             }
    6341             : 
    6342             :             // Copy from the temporary to the overview
    6343          28 :             for (int nDstYOff = nDstYOffStart;
    6344          54 :                  nDstYOff < nDstYOffEnd && eErr == CE_None;
    6345             :                  /* */)
    6346             :             {
    6347             :                 const int nDstYCount =
    6348          26 :                     std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
    6349          26 :                 for (int nDstXOff = nDstXOffStart;
    6350          52 :                      nDstXOff < nDstXOffEnd && eErr == CE_None;
    6351             :                      /* */)
    6352             :                 {
    6353             :                     const int nDstXCount =
    6354          26 :                         std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
    6355          56 :                     for (int iBand = 0; iBand < nBands && eErr == CE_None;
    6356             :                          ++iBand)
    6357             :                     {
    6358          30 :                         auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
    6359          30 :                         eErr = poSrcBand->RasterIO(
    6360             :                             GF_Read, nDstXOff - nDstXOffStart,
    6361             :                             nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
    6362             :                             pDstBuffer.get(), nDstXCount, nDstYCount,
    6363             :                             eWrkDataType, 0, 0, nullptr);
    6364          30 :                         if (eErr == CE_None)
    6365             :                         {
    6366             :                             // Write to the destination overview bands
    6367          30 :                             auto poOvrBand =
    6368          30 :                                 papapoOverviewBands[iBand][iOverview];
    6369          30 :                             eErr = poOvrBand->RasterIO(
    6370             :                                 GF_Write, nDstXOff, nDstYOff, nDstXCount,
    6371             :                                 nDstYCount, pDstBuffer.get(), nDstXCount,
    6372             :                                 nDstYCount, eWrkDataType, 0, 0, nullptr);
    6373             :                         }
    6374             :                     }
    6375          26 :                     nDstXOff += nDstXCount;
    6376             :                 }
    6377          26 :                 nDstYOff += nDstYCount;
    6378             :             }
    6379             : 
    6380          28 :             if (eErr != CE_None)
    6381             :             {
    6382           2 :                 CPLError(CE_Failure, CPLE_AppDefined,
    6383             :                          "Failed to write overview %d", iOverview);
    6384           2 :                 return eErr;
    6385             :             }
    6386             : 
    6387             :             // Flush the data to overviews.
    6388          56 :             for (int iBand = 0; iBand < nBands; ++iBand)
    6389          30 :                 papapoOverviewBands[iBand][iOverview]->FlushCache(false);
    6390             : 
    6391          26 :             continue;
    6392             :         }
    6393             : 
    6394             :         // Structure describing a resampling job
                               // (buffer holders, destination band, inputs and outputs of the
                               // resampling function), plus a private flag / mutex / condition
                               // variable used to signal and to wait for the job's completion.
    6395             :         struct OvrJob
    6396             :         {
    6397             :             // Buffers to free when job is finished
    6398             :             std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
    6399             :             std::unique_ptr<PointerHolder> oSrcBufferHolder{};
    6400             :             std::unique_ptr<PointerHolder> oDstBufferHolder{};
    6401             : 
                                   // Band that receives pDstBuffer when the job's data is written
    6402             :             GDALRasterBand *poDstBand = nullptr;
    6403             : 
    6404             :             // Input parameters of pfnResampleFn
    6405             :             GDALResampleFunction pfnResampleFn = nullptr;
    6406             :             GDALOverviewResampleArgs args{};
    6407             :             const void *pChunk = nullptr;
    6408             : 
    6409             :             // Output values of resampling function
                                   // (eErr stays CE_Failure until a resampling result is stored)
    6410             :             CPLErr eErr = CE_Failure;
    6411             :             void *pDstBuffer = nullptr;
    6412             :             GDALDataType eDstBufferDataType = GDT_Unknown;
    6413             : 
                                   // Marks the job as finished while holding the mutex and wakes
                                   // up one thread waiting on the condition variable
    6414        3280 :             void NotifyFinished()
    6415             :             {
    6416        6560 :                 std::lock_guard guard(mutex);
    6417        3280 :                 bFinished = true;
    6418        3280 :                 cv.notify_one();
    6419        3280 :             }
    6420             : 
                                   // Returns whether the job has been marked finished (the flag
                                   // is read while holding the mutex)
    6421           2 :             bool IsFinished()
    6422             :             {
    6423           2 :                 std::lock_guard guard(mutex);
    6424           4 :                 return bFinished;
    6425             :             }
    6426             : 
                                   // Blocks until the job has been marked finished; the flag is
                                   // re-checked after every wake-up of the condition variable
    6427          14 :             void WaitFinished()
    6428             :             {
    6429          28 :                 std::unique_lock oGuard(mutex);
    6430          22 :                 while (!bFinished)
    6431             :                 {
    6432           8 :                     cv.wait(oGuard);
    6433             :                 }
    6434          14 :             }
    6435             : 
    6436             :           private:
    6437             :             // Synchronization
                                   // (bFinished is only read or written with mutex held)
    6438             :             bool bFinished = false;
    6439             :             std::mutex mutex{};
    6440             :             std::condition_variable cv{};
    6441             :         };
    6442             : 
    6443             :         // Thread function to resample
                               // pData must point to an OvrJob. Calls the job's pfnResampleFn with
                               // the job's args and pChunk, records its return value, output buffer
                               // and output data type in the job, then marks the job as finished.
    6444        3280 :         const auto JobResampleFunc = [](void *pData)
    6445             :         {
    6446        3280 :             OvrJob *poJob = static_cast<OvrJob *>(pData);
    6447             : 
    6448        3280 :             poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
    6449             :                                                &(poJob->pDstBuffer),
    6450             :                                                &(poJob->eDstBufferDataType));
    6451             : 
                                   // Wrap the output buffer in a PointerHolder kept by the job
                                   // (OvrJob lists it among the buffers to free when the job is
                                   // finished)
    6452        3280 :             auto pDstBuffer = poJob->pDstBuffer;
    6453             :             poJob->oDstBufferHolder =
    6454        3280 :                 std::make_unique<PointerHolder>(pDstBuffer);
    6455             : 
                                   // Signal completion last, once all job outputs are stored
    6456        3280 :             poJob->NotifyFinished();
    6457        3280 :         };
    6458             : 
    6459             :         // Function to write resample data to target band
                               // Writes poJob->pDstBuffer to poJob->poDstBand with RasterIO(GF_Write)
                               // over the window [nDstXOff, nDstXOff2) x [nDstYOff, nDstYOff2) taken
                               // from poJob->args. The buffer is described as having the same
                               // dimensions as the window and type poJob->eDstBufferDataType.
                               // Returns the result of RasterIO().
    6460        3280 :         const auto WriteJobData = [](const OvrJob *poJob)
    6461             :         {
    6462        6560 :             return poJob->poDstBand->RasterIO(
    6463        3280 :                 GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
    6464        3280 :                 poJob->args.nDstXOff2 - poJob->args.nDstXOff,
    6465        3280 :                 poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
    6466        3280 :                 poJob->args.nDstXOff2 - poJob->args.nDstXOff,
    6467        3280 :                 poJob->args.nDstYOff2 - poJob->args.nDstYOff,
    6468        3280 :                 poJob->eDstBufferDataType, 0, 0, nullptr);
    6469             :         };
    6470             : 
    6471             :         // Wait for completion of oldest job and serialize it
                               // jobList.front() is used unconditionally, so the list must not be
                               // empty when this is called. The job's data is written only when its
                               // resampling returned CE_None. The job is removed from the list in
                               // all cases. Returns the resampling error if there was one, otherwise
                               // the result of the write.
    6472             :         const auto WaitAndFinalizeOldestJob =
    6473          14 :             [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    6474             :         {
    6475          14 :             auto poOldestJob = jobList.front().get();
    6476          14 :             poOldestJob->WaitFinished();
    6477          14 :             CPLErr l_eErr = poOldestJob->eErr;
    6478          14 :             if (l_eErr == CE_None)
    6479             :             {
    6480          14 :                 l_eErr = WriteJobData(poOldestJob);
    6481             :             }
    6482             : 
                                   // Removing the front entry destroys the job, since the list
                                   // holds it through a std::unique_ptr
    6483          14 :             jobList.pop_front();
    6484          14 :             return l_eErr;
    6485             :         };
    6486             : 
    6487             :         // Queue of jobs
    6488        1192 :         std::list<std::unique_ptr<OvrJob>> jobList;
    6489             : 
    6490        1192 :         std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
    6491             :         std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
    6492        1192 :             apabyChunkNoDataMask(nBands);
    6493             : 
    6494             :         // Iterate on destination overview, block by block.
    6495         596 :         for (int nDstYOff = nDstYOffStart;
    6496        2097 :              nDstYOff < nDstYOffEnd && eErr == CE_None;
    6497        1501 :              nDstYOff += nDstChunkYSize)
    6498             :         {
    6499             :             int nDstYCount;
    6500        1501 :             if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
    6501        1082 :                 nDstYCount = nDstChunkYSize;
    6502             :             else
    6503         419 :                 nDstYCount = nDstYOffEnd - nDstYOff;
    6504             : 
    6505        1501 :             int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    6506        1501 :             int nChunkYOff2 = static_cast<int>(
    6507        1501 :                 ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
    6508        1501 :             if (nChunkYOff2 > nSrcHeight ||
    6509        1501 :                 nDstYOff + nDstYCount == nDstTotalHeight)
    6510         589 :                 nChunkYOff2 = nSrcHeight;
    6511        1501 :             int nYCount = nChunkYOff2 - nChunkYOff;
    6512        1501 :             CPLAssert(nYCount <= nFullResYChunk);
    6513             : 
    6514        1501 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    6515        1501 :             int nChunkYSizeQueried =
    6516        1501 :                 nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
    6517        1501 :             if (nChunkYOffQueried < 0)
    6518             :             {
    6519         141 :                 nChunkYSizeQueried += nChunkYOffQueried;
    6520         141 :                 nChunkYOffQueried = 0;
    6521             :             }
    6522        1501 :             if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
    6523         140 :                 nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
    6524        1501 :             CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
    6525             : 
    6526        1501 :             if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
    6527             :                              nullptr, pProgressData))
    6528             :             {
    6529           1 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6530           1 :                 eErr = CE_Failure;
    6531             :             }
    6532             : 
    6533             :             // Iterate on destination overview, block by block.
    6534        1501 :             for (int nDstXOff = nDstXOffStart;
    6535        3041 :                  nDstXOff < nDstXOffEnd && eErr == CE_None;
    6536        1540 :                  nDstXOff += nDstChunkXSize)
    6537             :             {
    6538        1540 :                 int nDstXCount = 0;
    6539        1540 :                 if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
    6540        1523 :                     nDstXCount = nDstChunkXSize;
    6541             :                 else
    6542          17 :                     nDstXCount = nDstXOffEnd - nDstXOff;
    6543             : 
    6544        1540 :                 dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
    6545             : 
    6546        1540 :                 int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    6547        1540 :                 int nChunkXOff2 = static_cast<int>(
    6548        1540 :                     ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    6549        1540 :                 if (nChunkXOff2 > nSrcWidth ||
    6550        1540 :                     nDstXOff + nDstXCount == nDstTotalWidth)
    6551        1465 :                     nChunkXOff2 = nSrcWidth;
    6552        1540 :                 const int nXCount = nChunkXOff2 - nChunkXOff;
    6553        1540 :                 CPLAssert(nXCount <= nFullResXChunk);
    6554             : 
    6555        1540 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    6556        1540 :                 int nChunkXSizeQueried =
    6557        1540 :                     nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
    6558        1540 :                 if (nChunkXOffQueried < 0)
    6559             :                 {
    6560         201 :                     nChunkXSizeQueried += nChunkXOffQueried;
    6561         201 :                     nChunkXOffQueried = 0;
    6562             :                 }
    6563        1540 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
    6564         210 :                     nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
    6565        1540 :                 CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
    6566             : #if DEBUG_VERBOSE
    6567             :                 CPLDebug("GDAL",
    6568             :                          "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
    6569             :                          nChunkXOffQueried, nChunkYOffQueried,
    6570             :                          nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
    6571             :                          nDstYOff, nDstXCount, nDstYCount);
    6572             : #endif
    6573             : 
    6574             :                 // Avoid accumulating too many tasks and exhaust RAM
    6575             : 
    6576             :                 // Try to complete already finished jobs
    6577        1542 :                 while (eErr == CE_None && !jobList.empty())
    6578             :                 {
    6579           2 :                     auto poOldestJob = jobList.front().get();
    6580           2 :                     if (!poOldestJob->IsFinished())
    6581           0 :                         break;
    6582           2 :                     eErr = poOldestJob->eErr;
    6583           2 :                     if (eErr == CE_None)
    6584             :                     {
    6585           2 :                         eErr = WriteJobData(poOldestJob);
    6586             :                     }
    6587             : 
    6588           2 :                     jobList.pop_front();
    6589             :                 }
    6590             : 
    6591             :                 // And in case we have saturated the number of threads,
    6592             :                 // wait for completion of tasks to go below the threshold.
    6593        3080 :                 while (eErr == CE_None &&
    6594        1540 :                        jobList.size() >= static_cast<size_t>(nThreads))
    6595             :                 {
    6596           0 :                     eErr = WaitAndFinalizeOldestJob(jobList);
    6597             :                 }
    6598             : 
    6599             :                 // Read the source buffers for all the bands.
    6600        4821 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    6601             :                 {
    6602             :                     // (Re)allocate buffers if needed
    6603        3281 :                     if (apaChunk[iBand] == nullptr)
    6604             :                     {
    6605        1159 :                         apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
    6606             :                             nFullResXChunkQueried, nFullResYChunkQueried,
    6607             :                             nWrkDataTypeSize));
    6608        1159 :                         if (apaChunk[iBand] == nullptr)
    6609             :                         {
    6610           0 :                             eErr = CE_Failure;
    6611             :                         }
    6612             :                     }
    6613        3598 :                     if (bUseNoDataMask &&
    6614         317 :                         apabyChunkNoDataMask[iBand] == nullptr)
    6615             :                     {
    6616         266 :                         apabyChunkNoDataMask[iBand].reset(
    6617         266 :                             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    6618             :                                 nFullResXChunkQueried, nFullResYChunkQueried)));
    6619         266 :                         if (apabyChunkNoDataMask[iBand] == nullptr)
    6620             :                         {
    6621           0 :                             eErr = CE_Failure;
    6622             :                         }
    6623             :                     }
    6624             : 
    6625        3281 :                     if (eErr == CE_None)
    6626             :                     {
    6627        3281 :                         GDALRasterBand *poSrcBand = nullptr;
    6628        3281 :                         if (iSrcOverview == -1)
    6629        2391 :                             poSrcBand = papoSrcBands[iBand];
    6630             :                         else
    6631         890 :                             poSrcBand =
    6632         890 :                                 papapoOverviewBands[iBand][iSrcOverview];
    6633        3281 :                         eErr = poSrcBand->RasterIO(
    6634             :                             GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    6635             :                             nChunkXSizeQueried, nChunkYSizeQueried,
    6636        3281 :                             apaChunk[iBand].get(), nChunkXSizeQueried,
    6637             :                             nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
    6638             : 
    6639        3281 :                         if (bUseNoDataMask && eErr == CE_None)
    6640             :                         {
    6641         317 :                             auto poMaskBand = poSrcBand->IsMaskBand()
    6642         317 :                                                   ? poSrcBand
    6643         244 :                                                   : poSrcBand->GetMaskBand();
    6644         317 :                             eErr = poMaskBand->RasterIO(
    6645             :                                 GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    6646             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    6647         317 :                                 apabyChunkNoDataMask[iBand].get(),
    6648             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    6649             :                                 GDT_UInt8, 0, 0, nullptr);
    6650             :                         }
    6651             :                     }
    6652             :                 }
    6653             : 
    6654             :                 // Compute the resulting overview block.
    6655        4820 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    6656             :                 {
    6657        6560 :                     auto poJob = std::make_unique<OvrJob>();
    6658        3280 :                     poJob->pfnResampleFn = pfnResampleFn;
    6659        3280 :                     poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
    6660        6560 :                     poJob->args.eOvrDataType =
    6661        3280 :                         poJob->poDstBand->GetRasterDataType();
    6662        3280 :                     poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
    6663        3280 :                     poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
    6664        3280 :                     const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
    6665        3280 :                         "NBITS", "IMAGE_STRUCTURE");
    6666        3280 :                     poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    6667        3280 :                     poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    6668        3280 :                     poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    6669        3280 :                     poJob->args.eWrkDataType = eWrkDataType;
    6670        3280 :                     poJob->pChunk = apaChunk[iBand].get();
    6671        3280 :                     poJob->args.pabyChunkNodataMask =
    6672        3280 :                         apabyChunkNoDataMask[iBand].get();
    6673        3280 :                     poJob->args.nChunkXOff = nChunkXOffQueried;
    6674        3280 :                     poJob->args.nChunkXSize = nChunkXSizeQueried;
    6675        3280 :                     poJob->args.nChunkYOff = nChunkYOffQueried;
    6676        3280 :                     poJob->args.nChunkYSize = nChunkYSizeQueried;
    6677        3280 :                     poJob->args.nDstXOff = nDstXOff;
    6678        3280 :                     poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
    6679        3280 :                     poJob->args.nDstYOff = nDstYOff;
    6680        3280 :                     poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
    6681        3280 :                     poJob->args.pszResampling = pszResampling;
    6682        3280 :                     poJob->args.bHasNoData = abHasNoData[iBand];
    6683        3280 :                     poJob->args.dfNoDataValue = adfNoDataValue[iBand];
    6684        3280 :                     poJob->args.eSrcDataType = eDataType;
    6685        3280 :                     poJob->args.bPropagateNoData = bPropagateNoData;
    6686             : 
    6687        3280 :                     if (poJobQueue)
    6688             :                     {
    6689          16 :                         poJob->oSrcMaskBufferHolder =
    6690          32 :                             std::make_unique<PointerHolder>(
    6691          32 :                                 std::move(apabyChunkNoDataMask[iBand]));
    6692             : 
    6693          16 :                         poJob->oSrcBufferHolder =
    6694          32 :                             std::make_unique<PointerHolder>(
    6695          32 :                                 std::move(apaChunk[iBand]));
    6696             : 
    6697          16 :                         poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    6698          16 :                         jobList.emplace_back(std::move(poJob));
    6699             :                     }
    6700             :                     else
    6701             :                     {
    6702        3264 :                         JobResampleFunc(poJob.get());
    6703        3264 :                         eErr = poJob->eErr;
    6704        3264 :                         if (eErr == CE_None)
    6705             :                         {
    6706        3264 :                             eErr = WriteJobData(poJob.get());
    6707             :                         }
    6708             :                     }
    6709             :                 }
    6710             :             }
    6711             :         }
    6712             : 
    6713             :         // Wait for all pending jobs to complete
    6714         610 :         while (!jobList.empty())
    6715             :         {
    6716          14 :             const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    6717          14 :             if (l_eErr != CE_None && eErr == CE_None)
    6718           0 :                 eErr = l_eErr;
    6719             :         }
    6720             : 
    6721             :         // Flush the data to overviews.
    6722        1753 :         for (int iBand = 0; iBand < nBands; ++iBand)
    6723             :         {
    6724        1157 :             if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
    6725             :                 CE_None)
    6726           0 :                 eErr = CE_Failure;
    6727             :         }
    6728             :     }
    6729             : 
    6730         381 :     if (eErr == CE_None)
    6731         377 :         pfnProgress(1.0, nullptr, pProgressData);
    6732             : 
    6733         381 :     return eErr;
    6734             : }
    6735             : 
    6736             : /************************************************************************/
    6737             : /*                  GDALRegenerateOverviewsMultiBand()                  */
    6738             : /************************************************************************/
    6739             : 
    6740             : /**
    6741             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    6742             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
    6743             :  *
    6744             :  * This function will generate one or more overview images from a base
    6745             :  * image using the requested downsampling algorithm.  Its primary use
    6746             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    6747             :  * can also be used to generate downsampled images in one file from another
    6748             :  * outside the overview architecture.
    6749             :  *
    6750             :  * The output bands need to exist in advance and share the same characteristics
    6751             :  * (type, dimensions)
    6752             :  *
    6753             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    6754             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
    6755             :  *
    6756             :  * It does not support color tables or complex data types.
    6757             :  *
    6758             :  * The pseudo-algorithm used by the function is :
    6759             :  *    for each overview
    6760             :  *       iterate on lines of the source by a step of deltay
    6761             :  *           iterate on columns of the source  by a step of deltax
    6762             :  *               read the source data of size deltax * deltay for all the bands
    6763             :  *               generate the corresponding overview block for all the bands
    6764             :  *
    6765             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    6766             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    6767             :  * considered as the nodata value and not each value of the triplet
    6768             :  * independently per band.
    6769             :  *
    6770             :  * The GDAL_NUM_THREADS configuration option can be set
    6771             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    6772             :  * overview computation.
    6773             :  *
    6774             :  * @param apoSrcBands the list of source bands to downsample
    6775             :  * @param aapoOverviewBands two-dimensional array of bands. First dimension is
    6776             :  *                          indexed by bands. Second dimension is indexed by
    6777             :  *                          overview levels. All aapoOverviewBands[i] arrays
    6778             :  *                          must have the same size (i.e. same number of
    6779             :  *                          overviews)
    6780             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    6781             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
    6782             :  * @param pfnProgress progress report function.
    6783             :  * @param pProgressData progress function callback data.
    6784             :  * @param papszOptions NULL terminated list of options as
    6785             :  *                     key=value pairs, or NULL
    6786             :  *                     The XOFF, YOFF, XSIZE and YSIZE
    6787             :  *                     options can be specified to express that overviews should
    6788             :  *                     be regenerated only in the specified subset of the source
    6789             :  *                     dataset.
    6790             :  * @return CE_None on success or CE_Failure on failure.
    6791             :  * @since 3.10
    6792             :  */
    6793             : 
    6794          19 : CPLErr GDALRegenerateOverviewsMultiBand(
    6795             :     const std::vector<GDALRasterBand *> &apoSrcBands,
    6796             :     const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
    6797             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    6798             :     void *pProgressData, CSLConstList papszOptions)
    6799             : {
    6800          19 :     CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
    6801          29 :     for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
    6802             :     {
    6803          10 :         CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
    6804             :     }
    6805             : 
    6806          19 :     if (aapoOverviewBands.empty())
    6807           0 :         return CE_None;
    6808             : 
    6809          19 :     std::vector<GDALRasterBand **> apapoOverviewBands;
    6810          48 :     for (auto &apoOverviewBands : aapoOverviewBands)
    6811             :     {
    6812             :         auto papoOverviewBands = static_cast<GDALRasterBand **>(
    6813          29 :             CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
    6814          61 :         for (size_t i = 0; i < apoOverviewBands.size(); ++i)
    6815             :         {
    6816          32 :             papoOverviewBands[i] = apoOverviewBands[i];
    6817             :         }
    6818          29 :         apapoOverviewBands.push_back(papoOverviewBands);
    6819             :     }
    6820          38 :     const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
    6821          19 :         static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
    6822          19 :         static_cast<int>(aapoOverviewBands[0].size()),
    6823          19 :         apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
    6824             :         papszOptions);
    6825          48 :     for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
    6826          29 :         CPLFree(papoOverviewBands);
    6827          19 :     return eErr;
    6828             : }
    6829             : 
    6830             : /************************************************************************/
    6831             : /*                        GDALComputeBandStats()                        */
    6832             : /************************************************************************/
    6833             : 
    6834             : /** Undocumented
    6835             :  * @param hSrcBand undocumented.
    6836             :  * @param nSampleStep Step between scanlines used to compute statistics.
    6837             :  *                    When nSampleStep is equal to 1, all scanlines will
    6838             :  *                    be processed.
    6839             :  * @param pdfMean undocumented.
    6840             :  * @param pdfStdDev undocumented.
    6841             :  * @param pfnProgress undocumented.
    6842             :  * @param pProgressData undocumented.
    6843             :  * @return undocumented
    6844             :  */
    6845          18 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
    6846             :                                         int nSampleStep, double *pdfMean,
    6847             :                                         double *pdfStdDev,
    6848             :                                         GDALProgressFunc pfnProgress,
    6849             :                                         void *pProgressData)
    6850             : 
    6851             : {
    6852          18 :     VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
    6853             : 
    6854          18 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    6855             : 
    6856          18 :     if (pfnProgress == nullptr)
    6857          18 :         pfnProgress = GDALDummyProgress;
    6858             : 
    6859          18 :     const int nWidth = poSrcBand->GetXSize();
    6860          18 :     const int nHeight = poSrcBand->GetYSize();
    6861             : 
    6862          18 :     if (nSampleStep >= nHeight || nSampleStep < 1)
    6863           5 :         nSampleStep = 1;
    6864             : 
    6865          18 :     GDALDataType eWrkType = GDT_Unknown;
    6866          18 :     float *pafData = nullptr;
    6867          18 :     GDALDataType eType = poSrcBand->GetRasterDataType();
    6868          18 :     const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    6869          18 :     if (bComplex)
    6870             :     {
    6871             :         pafData = static_cast<float *>(
    6872           0 :             VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
    6873           0 :         eWrkType = GDT_CFloat32;
    6874             :     }
    6875             :     else
    6876             :     {
    6877             :         pafData =
    6878          18 :             static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
    6879          18 :         eWrkType = GDT_Float32;
    6880             :     }
    6881             : 
    6882          18 :     if (nWidth == 0 || pafData == nullptr)
    6883             :     {
    6884           0 :         VSIFree(pafData);
    6885           0 :         return CE_Failure;
    6886             :     }
    6887             : 
    6888             :     /* -------------------------------------------------------------------- */
    6889             :     /*      Loop over all sample lines.                                     */
    6890             :     /* -------------------------------------------------------------------- */
    6891          18 :     double dfSum = 0.0;
    6892          18 :     double dfSum2 = 0.0;
    6893          18 :     int iLine = 0;
    6894          18 :     GIntBig nSamples = 0;
    6895             : 
    6896        2143 :     do
    6897             :     {
    6898        2161 :         if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    6899             :                          pProgressData))
    6900             :         {
    6901           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6902           0 :             CPLFree(pafData);
    6903           0 :             return CE_Failure;
    6904             :         }
    6905             : 
    6906             :         const CPLErr eErr =
    6907        2161 :             poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
    6908             :                                 1, eWrkType, 0, 0, nullptr);
    6909        2161 :         if (eErr != CE_None)
    6910             :         {
    6911           1 :             CPLFree(pafData);
    6912           1 :             return eErr;
    6913             :         }
    6914             : 
    6915      725208 :         for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    6916             :         {
    6917      723048 :             float fValue = 0.0f;
    6918             : 
    6919      723048 :             if (bComplex)
    6920             :             {
    6921             :                 // Compute the magnitude of the complex value.
    6922             :                 fValue =
    6923           0 :                     std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
    6924           0 :                                pafData[static_cast<size_t>(iPixel) * 2 + 1]);
    6925             :             }
    6926             :             else
    6927             :             {
    6928      723048 :                 fValue = pafData[iPixel];
    6929             :             }
    6930             : 
    6931      723048 :             dfSum += static_cast<double>(fValue);
    6932      723048 :             dfSum2 += static_cast<double>(fValue) * static_cast<double>(fValue);
    6933             :         }
    6934             : 
    6935        2160 :         nSamples += nWidth;
    6936        2160 :         iLine += nSampleStep;
    6937        2160 :     } while (iLine < nHeight);
    6938             : 
    6939          17 :     if (!pfnProgress(1.0, nullptr, pProgressData))
    6940             :     {
    6941           0 :         CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6942           0 :         CPLFree(pafData);
    6943           0 :         return CE_Failure;
    6944             :     }
    6945             : 
    6946             :     /* -------------------------------------------------------------------- */
    6947             :     /*      Produce the result values.                                      */
    6948             :     /* -------------------------------------------------------------------- */
    6949          17 :     if (pdfMean != nullptr)
    6950          17 :         *pdfMean = dfSum / nSamples;
    6951             : 
    6952          17 :     if (pdfStdDev != nullptr)
    6953             :     {
    6954          17 :         const double dfMean = dfSum / nSamples;
    6955             : 
    6956          17 :         *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
    6957             :     }
    6958             : 
    6959          17 :     CPLFree(pafData);
    6960             : 
    6961          17 :     return CE_None;
    6962             : }
    6963             : 
    6964             : /************************************************************************/
    6965             : /*                  GDALOverviewMagnitudeCorrection()                   */
    6966             : /*                                                                      */
    6967             : /*      Correct the mean and standard deviation of the overviews of     */
    6968             : /*      the given band to match the base layer approximately.           */
    6969             : /************************************************************************/
    6970             : 
    6971             : /** Undocumented
    6972             :  * @param hBaseBand undocumented.
    6973             :  * @param nOverviewCount undocumented.
    6974             :  * @param pahOverviews undocumented.
    6975             :  * @param pfnProgress undocumented.
    6976             :  * @param pProgressData undocumented.
    6977             :  * @return undocumented
    6978             :  */
    6979           0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
    6980             :                                        int nOverviewCount,
    6981             :                                        GDALRasterBandH *pahOverviews,
    6982             :                                        GDALProgressFunc pfnProgress,
    6983             :                                        void *pProgressData)
    6984             : 
    6985             : {
    6986           0 :     VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
    6987             : 
    6988             :     /* -------------------------------------------------------------------- */
    6989             :     /*      Compute mean/stddev for source raster.                          */
    6990             :     /* -------------------------------------------------------------------- */
    6991           0 :     double dfOrigMean = 0.0;
    6992           0 :     double dfOrigStdDev = 0.0;
    6993             :     {
    6994             :         const CPLErr eErr =
    6995           0 :             GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
    6996             :                                  pfnProgress, pProgressData);
    6997             : 
    6998           0 :         if (eErr != CE_None)
    6999           0 :             return eErr;
    7000             :     }
    7001             : 
    7002             :     /* -------------------------------------------------------------------- */
    7003             :     /*      Loop on overview bands.                                         */
    7004             :     /* -------------------------------------------------------------------- */
    7005           0 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    7006             :     {
    7007             :         GDALRasterBand *poOverview =
    7008           0 :             GDALRasterBand::FromHandle(pahOverviews[iOverview]);
    7009             :         double dfOverviewMean, dfOverviewStdDev;
    7010             : 
    7011             :         const CPLErr eErr =
    7012           0 :             GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
    7013             :                                  &dfOverviewStdDev, pfnProgress, pProgressData);
    7014             : 
    7015           0 :         if (eErr != CE_None)
    7016           0 :             return eErr;
    7017             : 
    7018           0 :         double dfGain = 1.0;
    7019           0 :         if (dfOrigStdDev >= 0.0001)
    7020           0 :             dfGain = dfOrigStdDev / dfOverviewStdDev;
    7021             : 
    7022             :         /* --------------------------------------------------------------------
    7023             :          */
    7024             :         /*      Apply gain and offset. */
    7025             :         /* --------------------------------------------------------------------
    7026             :          */
    7027           0 :         const int nWidth = poOverview->GetXSize();
    7028           0 :         const int nHeight = poOverview->GetYSize();
    7029             : 
    7030           0 :         GDALDataType eWrkType = GDT_Unknown;
    7031           0 :         float *pafData = nullptr;
    7032           0 :         const GDALDataType eType = poOverview->GetRasterDataType();
    7033           0 :         const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    7034           0 :         if (bComplex)
    7035             :         {
    7036             :             pafData = static_cast<float *>(
    7037           0 :                 VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
    7038           0 :             eWrkType = GDT_CFloat32;
    7039             :         }
    7040             :         else
    7041             :         {
    7042             :             pafData = static_cast<float *>(
    7043           0 :                 VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
    7044           0 :             eWrkType = GDT_Float32;
    7045             :         }
    7046             : 
    7047           0 :         if (pafData == nullptr)
    7048             :         {
    7049           0 :             return CE_Failure;
    7050             :         }
    7051             : 
    7052           0 :         for (int iLine = 0; iLine < nHeight; ++iLine)
    7053             :         {
    7054           0 :             if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    7055             :                              pProgressData))
    7056             :             {
    7057           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    7058           0 :                 CPLFree(pafData);
    7059           0 :                 return CE_Failure;
    7060             :             }
    7061             : 
    7062           0 :             if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
    7063             :                                      nWidth, 1, eWrkType, 0, 0,
    7064           0 :                                      nullptr) != CE_None)
    7065             :             {
    7066           0 :                 CPLFree(pafData);
    7067           0 :                 return CE_Failure;
    7068             :             }
    7069             : 
    7070           0 :             for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    7071             :             {
    7072           0 :                 if (bComplex)
    7073             :                 {
    7074           0 :                     pafData[static_cast<size_t>(iPixel) * 2] *=
    7075           0 :                         static_cast<float>(dfGain);
    7076           0 :                     pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
    7077           0 :                         static_cast<float>(dfGain);
    7078             :                 }
    7079             :                 else
    7080             :                 {
    7081           0 :                     pafData[iPixel] = static_cast<float>(
    7082           0 :                         (double(pafData[iPixel]) - dfOverviewMean) * dfGain +
    7083             :                         dfOrigMean);
    7084             :                 }
    7085             :             }
    7086             : 
    7087           0 :             if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
    7088             :                                      nWidth, 1, eWrkType, 0, 0,
    7089           0 :                                      nullptr) != CE_None)
    7090             :             {
    7091           0 :                 CPLFree(pafData);
    7092           0 :                 return CE_Failure;
    7093             :             }
    7094             :         }
    7095             : 
    7096           0 :         if (!pfnProgress(1.0, nullptr, pProgressData))
    7097             :         {
    7098           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    7099           0 :             CPLFree(pafData);
    7100           0 :             return CE_Failure;
    7101             :         }
    7102             : 
    7103           0 :         CPLFree(pafData);
    7104             :     }
    7105             : 
    7106           0 :     return CE_None;
    7107             : }

Generated by: LCOV version 1.14