LCOV - code coverage report
Current view: top level - gcore - overview.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2704 3078 87.8 %
Date: 2025-09-10 17:48:50 Functions: 166 173 96.0 %

          Line data    Source code
       1             : 
       2             : /******************************************************************************
       3             :  *
       4             :  * Project:  GDAL Core
       5             :  * Purpose:  Helper code to implement overview support in different drivers.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 2000, Frank Warmerdam
      10             :  * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal_priv.h"
      17             : 
      18             : #include <cmath>
      19             : #include <cstddef>
      20             : #include <cstdlib>
      21             : 
      22             : #include <algorithm>
      23             : #include <complex>
      24             : #include <condition_variable>
      25             : #include <limits>
      26             : #include <list>
      27             : #include <memory>
      28             : #include <mutex>
      29             : #include <vector>
      30             : 
      31             : #include "cpl_conv.h"
      32             : #include "cpl_error.h"
      33             : #include "cpl_float.h"
      34             : #include "cpl_progress.h"
      35             : #include "cpl_vsi.h"
      36             : #include "gdal.h"
      37             : #include "gdal_thread_pool.h"
      38             : #include "gdalwarper.h"
      39             : #include "gdal_vrt.h"
      40             : #include "vrtdataset.h"
      41             : 
      42             : #ifdef USE_NEON_OPTIMIZATIONS
      43             : #include "include_sse2neon.h"
      44             : 
      45             : #if (!defined(__aarch64__) && !defined(_M_ARM64))
      46             : #define ARM_V7
      47             : #endif
      48             : 
      49             : #define USE_SSE2
      50             : 
      51             : #include "gdalsse_priv.h"
      52             : 
      53             : // Restrict to 64bit processors because they are guaranteed to have SSE2,
      54             : // or if __AVX2__ is defined.
      55             : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
      56             : #define USE_SSE2
      57             : 
      58             : #include "gdalsse_priv.h"
      59             : 
      60             : #ifdef __SSE3__
      61             : #include <pmmintrin.h>
      62             : #endif
      63             : #ifdef __SSSE3__
      64             : #include <tmmintrin.h>
      65             : #endif
      66             : #ifdef __SSE4_1__
      67             : #include <smmintrin.h>
      68             : #endif
      69             : #ifdef __AVX2__
      70             : #include <immintrin.h>
      71             : #endif
      72             : 
      73             : #endif
      74             : 
      75             : // To be included after above USE_SSE2 and include gdalsse_priv.h
      76             : // to avoid build issue on Windows x86
      77             : #include "gdal_priv_templates.hpp"
      78             : 
      79             : /************************************************************************/
      80             : /*                      GDALResampleChunk_Near()                        */
      81             : /************************************************************************/
      82             : 
      83             : template <class T>
      84        1245 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
      85             :                                       const T *pChunk, T **ppDstBuffer)
      86             : 
      87             : {
      88        1245 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
      89        1245 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
      90        1245 :     const GDALDataType eWrkDataType = args.eWrkDataType;
      91        1245 :     const int nChunkXOff = args.nChunkXOff;
      92        1245 :     const int nChunkXSize = args.nChunkXSize;
      93        1245 :     const int nChunkYOff = args.nChunkYOff;
      94        1245 :     const int nDstXOff = args.nDstXOff;
      95        1245 :     const int nDstXOff2 = args.nDstXOff2;
      96        1245 :     const int nDstYOff = args.nDstYOff;
      97        1245 :     const int nDstYOff2 = args.nDstYOff2;
      98        1245 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
      99             : 
     100             :     /* -------------------------------------------------------------------- */
     101             :     /*      Allocate buffers.                                               */
     102             :     /* -------------------------------------------------------------------- */
     103        1245 :     *ppDstBuffer = static_cast<T *>(
     104        1245 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
     105             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
     106        1245 :     if (*ppDstBuffer == nullptr)
     107             :     {
     108           0 :         return CE_Failure;
     109             :     }
     110        1245 :     T *const pDstBuffer = *ppDstBuffer;
     111             : 
     112             :     int *panSrcXOff =
     113        1245 :         static_cast<int *>(VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(int)));
     114             : 
     115        1245 :     if (panSrcXOff == nullptr)
     116             :     {
     117           0 :         return CE_Failure;
     118             :     }
     119             : 
     120             :     /* ==================================================================== */
     121             :     /*      Precompute inner loop constants.                                */
     122             :     /* ==================================================================== */
     123      842563 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
     124             :     {
     125      841318 :         int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
     126      841318 :         if (nSrcXOff < nChunkXOff)
     127           0 :             nSrcXOff = nChunkXOff;
     128             : 
     129      841318 :         panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
     130             :     }
     131             : 
     132             :     /* ==================================================================== */
     133             :     /*      Loop over destination scanlines.                                */
     134             :     /* ==================================================================== */
     135      142379 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
     136             :     {
     137      141134 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
     138      141134 :         if (nSrcYOff < nChunkYOff)
     139           0 :             nSrcYOff = nChunkYOff;
     140             : 
     141      141134 :         const T *const pSrcScanline =
     142             :             pChunk +
     143      141134 :             (static_cast<size_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
     144      138100 :             nChunkXOff;
     145             : 
     146             :         /* --------------------------------------------------------------------
     147             :          */
     148             :         /*      Loop over destination pixels */
     149             :         /* --------------------------------------------------------------------
     150             :          */
     151      141134 :         T *pDstScanline =
     152      141134 :             pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
     153   119889794 :         for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
     154             :         {
     155   119748760 :             pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
     156             :         }
     157             :     }
     158             : 
     159        1245 :     CPLFree(panSrcXOff);
     160             : 
     161        1245 :     return CE_None;
     162             : }
     163             : 
     164        1245 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
     165             :                                      const void *pChunk, void **ppDstBuffer,
     166             :                                      GDALDataType *peDstBufferDataType)
     167             : {
     168        1245 :     *peDstBufferDataType = args.eWrkDataType;
     169        1245 :     switch (args.eWrkDataType)
     170             :     {
     171             :         // For nearest resampling, as no computation is done, only the
     172             :         // size of the data type matters.
     173        1088 :         case GDT_Byte:
     174             :         case GDT_Int8:
     175             :         {
     176        1088 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
     177        1088 :             return GDALResampleChunk_NearT(
     178             :                 args, static_cast<const uint8_t *>(pChunk),
     179        1088 :                 reinterpret_cast<uint8_t **>(ppDstBuffer));
     180             :         }
     181             : 
     182          52 :         case GDT_Int16:
     183             :         case GDT_UInt16:
     184             :         case GDT_Float16:
     185             :         {
     186          52 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
     187          52 :             return GDALResampleChunk_NearT(
     188             :                 args, static_cast<const uint16_t *>(pChunk),
     189          52 :                 reinterpret_cast<uint16_t **>(ppDstBuffer));
     190             :         }
     191             : 
     192          57 :         case GDT_CInt16:
     193             :         case GDT_CFloat16:
     194             :         case GDT_Int32:
     195             :         case GDT_UInt32:
     196             :         case GDT_Float32:
     197             :         {
     198          57 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
     199          57 :             return GDALResampleChunk_NearT(
     200             :                 args, static_cast<const uint32_t *>(pChunk),
     201          57 :                 reinterpret_cast<uint32_t **>(ppDstBuffer));
     202             :         }
     203             : 
     204          44 :         case GDT_CInt32:
     205             :         case GDT_CFloat32:
     206             :         case GDT_Int64:
     207             :         case GDT_UInt64:
     208             :         case GDT_Float64:
     209             :         {
     210          44 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
     211          44 :             return GDALResampleChunk_NearT(
     212             :                 args, static_cast<const uint64_t *>(pChunk),
     213          44 :                 reinterpret_cast<uint64_t **>(ppDstBuffer));
     214             :         }
     215             : 
     216           4 :         case GDT_CFloat64:
     217             :         {
     218           4 :             return GDALResampleChunk_NearT(
     219             :                 args, static_cast<const std::complex<double> *>(pChunk),
     220           4 :                 reinterpret_cast<std::complex<double> **>(ppDstBuffer));
     221             :         }
     222             : 
     223           0 :         case GDT_Unknown:
     224             :         case GDT_TypeCount:
     225           0 :             break;
     226             :     }
     227           0 :     CPLAssert(false);
     228             :     return CE_Failure;
     229             : }
     230             : 
     231             : namespace
     232             : {
     233             : 
     234             : // Find in the color table the entry whose RGB value is the closest
     235             : // (using quadratic distance) to the test color, ignoring transparent entries.
     236        3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
     237             :                    const GDALColorEntry &test)
     238             : {
     239        3837 :     int nMinDist = std::numeric_limits<int>::max();
     240        3837 :     size_t bestEntry = 0;
     241      986109 :     for (size_t i = 0; i < entries.size(); ++i)
     242             :     {
     243      982272 :         const GDALColorEntry &entry = entries[i];
     244             :         // Ignore transparent entries
     245      982272 :         if (entry.c4 == 0)
     246        3237 :             continue;
     247             : 
     248      979035 :         int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
     249      979035 :                     ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
     250      979035 :                     ((test.c3 - entry.c3) * (test.c3 - entry.c3));
     251      979035 :         if (nDist < nMinDist)
     252             :         {
     253       15847 :             nMinDist = nDist;
     254       15847 :             bestEntry = i;
     255             :         }
     256             :     }
     257        3837 :     return static_cast<int>(bestEntry);
     258             : }
     259             : 
     260           7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
     261             :                                            int &transparentIdx)
     262             : {
     263           7 :     std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
     264             : 
     265           7 :     transparentIdx = -1;
     266           7 :     int i = 0;
     267        1799 :     for (auto &entry : entries)
     268             :     {
     269        1792 :         table.GetColorEntryAsRGB(i, &entry);
     270        1792 :         if (transparentIdx < 0 && entry.c4 == 0)
     271           1 :             transparentIdx = i;
     272        1792 :         ++i;
     273             :     }
     274           7 :     return entries;
     275             : }
     276             : 
     277             : }  // unnamed  namespace
     278             : 
     279             : /************************************************************************/
     280             : /*                             SQUARE()                                 */
     281             : /************************************************************************/
     282             : 
     283        4897 : template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
     284             : {
     285        4897 :     return static_cast<Tsquare>(val) * val;
     286             : }
     287             : 
     288             : /************************************************************************/
     289             : /*                          ComputeIntegerRMS()                         */
     290             : /************************************************************************/
     291             : // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
     292             : // integer that minimizes abs(rms**2 - sumSquares / weight)
     293             : template <class T, class Twork>
     294          42 : inline T ComputeIntegerRMS(double sumSquares, double weight)
     295             : {
     296          42 :     const double sumDivWeight = sumSquares / weight;
     297          42 :     T rms = static_cast<T>(sqrt(sumDivWeight));
     298             : 
     299             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     300             :     // Naive version:
     301             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     302          42 :     if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
     303          42 :         2 * sumDivWeight)
     304           6 :         rms += 1;
     305          42 :     return rms;
     306             : }
     307             : 
     308             : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
     309             : {
     310             :     CPLAssert(false);
     311             :     return 0;
     312             : }
     313             : 
     314          28 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
     315             : {
     316             :     // It has been verified that given the correction on rms below, using
     317             :     // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
     318             :     // is equivalent, so use the former as it is used twice.
     319          28 :     const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
     320          28 :     const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
     321          28 :     GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
     322             : 
     323             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     324             :     // Naive version:
     325             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     326             :     // Optimized version for integer case and weight == 4
     327          28 :     if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
     328           5 :         rms += 1;
     329          28 :     return rms;
     330             : }
     331             : 
     332             : template <>
     333          24 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
     334             : {
     335          24 :     const double sumDivWeight = sumSquares * 0.25;
     336          24 :     GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
     337             : 
     338             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     339             :     // Naive version:
     340             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     341             :     // Optimized version for integer case and weight == 4
     342          24 :     if (static_cast<GUInt32>(rms) * (rms + 1) <
     343          24 :         static_cast<GUInt32>(sumDivWeight + 0.25))
     344           4 :         rms += 1;
     345          24 :     return rms;
     346             : }
     347             : 
     348             : #ifdef USE_SSE2
     349             : 
     350             : /************************************************************************/
     351             : /*                   QuadraticMeanByteSSE2OrAVX2()                      */
     352             : /************************************************************************/
     353             : 
     354             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     355             : #define sse2_packus_epi32 _mm_packus_epi32
     356             : #else
     357      516139 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
     358             : {
     359      516139 :     const auto minus32768_32 = _mm_set1_epi32(-32768);
     360      516139 :     const auto minus32768_16 = _mm_set1_epi16(-32768);
     361      516139 :     a = _mm_add_epi32(a, minus32768_32);
     362      516139 :     b = _mm_add_epi32(b, minus32768_32);
     363      516139 :     a = _mm_packs_epi32(a, b);
     364      516139 :     a = _mm_sub_epi16(a, minus32768_16);
     365      516139 :     return a;
     366             : }
     367             : #endif
     368             : 
     369             : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
     370             : #define sse2_hadd_epi16 _mm_hadd_epi16
     371             : #else
     372     4715530 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
     373             : {
     374             :     // Horizontal addition of adjacent pairs
     375     4715530 :     const auto mask = _mm_set1_epi32(0xFFFF);
     376             :     const auto horizLo =
     377    14146600 :         _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
     378             :     const auto horizHi =
     379    14146600 :         _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
     380             : 
     381             :     // Recombine low and high parts
     382     4715530 :     return _mm_packs_epi32(horizLo, horizHi);
     383             : }
     384             : #endif
     385             : 
     386             : #ifdef __AVX2__
     387             : 
     388             : #define set1_epi16 _mm256_set1_epi16
     389             : #define set1_epi32 _mm256_set1_epi32
     390             : #define setzero _mm256_setzero_si256
     391             : #define set1_ps _mm256_set1_ps
     392             : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
     393             : #define unpacklo_epi8 _mm256_unpacklo_epi8
     394             : #define unpackhi_epi8 _mm256_unpackhi_epi8
     395             : #define madd_epi16 _mm256_madd_epi16
     396             : #define add_epi32 _mm256_add_epi32
     397             : #define mul_ps _mm256_mul_ps
     398             : #define cvtepi32_ps _mm256_cvtepi32_ps
     399             : #define sqrt_ps _mm256_sqrt_ps
     400             : #define cvttps_epi32 _mm256_cvttps_epi32
     401             : #define packs_epi32 _mm256_packs_epi32
     402             : #define packus_epi32 _mm256_packus_epi32
     403             : #define srli_epi32 _mm256_srli_epi32
     404             : #define mullo_epi16 _mm256_mullo_epi16
     405             : #define srli_epi16 _mm256_srli_epi16
     406             : #define cmpgt_epi16 _mm256_cmpgt_epi16
     407             : #define add_epi16 _mm256_add_epi16
     408             : #define sub_epi16 _mm256_sub_epi16
     409             : #define packus_epi16 _mm256_packus_epi16
     410             : 
     411             : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
     412             : /* to get the lower 128 bits of what would be a true 256-bit vector register
     413             :  */
     414             : 
     415             : inline __m256i FIXUP_LANES(__m256i x)
     416             : {
     417             :     return _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
     418             : }
     419             : 
     420             : #define store_lo(x, y)                                                         \
     421             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
     422             :                      _mm256_extracti128_si256(FIXUP_LANES(y), 0))
     423             : #define storeu_int(x, y)                                                       \
     424             :     _mm256_storeu_si256(reinterpret_cast<__m256i *>(x), FIXUP_LANES(y))
     425             : #define hadd_epi16 _mm256_hadd_epi16
     426             : #else
     427             : #define set1_epi16 _mm_set1_epi16
     428             : #define set1_epi32 _mm_set1_epi32
     429             : #define setzero _mm_setzero_si128
     430             : #define set1_ps _mm_set1_ps
     431             : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
     432             : #define unpacklo_epi8 _mm_unpacklo_epi8
     433             : #define unpackhi_epi8 _mm_unpackhi_epi8
     434             : #define madd_epi16 _mm_madd_epi16
     435             : #define add_epi32 _mm_add_epi32
     436             : #define mul_ps _mm_mul_ps
     437             : #define cvtepi32_ps _mm_cvtepi32_ps
     438             : #define sqrt_ps _mm_sqrt_ps
     439             : #define cvttps_epi32 _mm_cvttps_epi32
     440             : #define packs_epi32 _mm_packs_epi32
     441             : #define packus_epi32 sse2_packus_epi32
     442             : #define srli_epi32 _mm_srli_epi32
     443             : #define mullo_epi16 _mm_mullo_epi16
     444             : #define srli_epi16 _mm_srli_epi16
     445             : #define cmpgt_epi16 _mm_cmpgt_epi16
     446             : #define add_epi16 _mm_add_epi16
     447             : #define sub_epi16 _mm_sub_epi16
     448             : #define packus_epi16 _mm_packus_epi16
     449             : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
     450             : #define storeu_int(x, y) _mm_storeu_si128(reinterpret_cast<__m128i *>(x), (y))
     451             : #define hadd_epi16 sse2_hadd_epi16
     452             : #endif
     453             : 
     454             : template <class T>
     455             : static int
     456             : #if defined(__GNUC__)
     457             :     __attribute__((noinline))
     458             : #endif
     459        5389 :     QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
     460             :                                 const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     461             :                                 T *CPL_RESTRICT pDstScanline)
     462             : {
     463             :     // Optimized implementation for RMS on Byte by
     464             :     // processing by group of 8 output pixels, so as to use
     465             :     // a single _mm_sqrt_ps() call for 4 output pixels
     466        5389 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     467             : 
     468        5389 :     int iDstPixel = 0;
     469        5389 :     const auto one16 = set1_epi16(1);
     470        5389 :     const auto one32 = set1_epi32(1);
     471        5389 :     const auto zero = setzero();
     472        5389 :     const auto minus32768 = set1_epi16(-32768);
     473             : 
     474        5389 :     constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;
     475      521504 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     476             :     {
     477             :         // Load 2 * DEST_ELTS bytes from each line
     478      516115 :         auto firstLine = loadu_int(pSrcScanlineShifted);
     479     1032230 :         auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
     480             :         // Extend those Bytes as UInt16s
     481      516115 :         auto firstLineLo = unpacklo_epi8(firstLine, zero);
     482      516115 :         auto firstLineHi = unpackhi_epi8(firstLine, zero);
     483      516115 :         auto secondLineLo = unpacklo_epi8(secondLine, zero);
     484      516115 :         auto secondLineHi = unpackhi_epi8(secondLine, zero);
     485             : 
     486             :         // Multiplication of 16 bit values and horizontal
     487             :         // addition of 32 bit results
     488             :         // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
     489      516115 :         firstLineLo = madd_epi16(firstLineLo, firstLineLo);
     490      516115 :         firstLineHi = madd_epi16(firstLineHi, firstLineHi);
     491      516115 :         secondLineLo = madd_epi16(secondLineLo, secondLineLo);
     492      516115 :         secondLineHi = madd_epi16(secondLineHi, secondLineHi);
     493             : 
     494             :         // Vertical addition
     495      516115 :         const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
     496      516115 :         const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
     497             : 
     498             :         const auto sumSquaresPlusOneDiv4Lo =
     499     1032230 :             srli_epi32(add_epi32(sumSquaresLo, one32), 2);
     500             :         const auto sumSquaresPlusOneDiv4Hi =
     501     1032230 :             srli_epi32(add_epi32(sumSquaresHi, one32), 2);
     502             : 
     503             :         // Take square root and truncate/floor to int32
     504             :         const auto rmsLo =
     505     1548340 :             cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
     506             :         const auto rmsHi =
     507     1548340 :             cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
     508             : 
     509             :         // Merge back low and high registers with each RMS value
     510             :         // as a 16 bit value.
     511      516115 :         auto rms = packs_epi32(rmsLo, rmsHi);
     512             : 
     513             :         // Round to upper value if it minimizes the
     514             :         // error |rms^2 - sumSquares/4|
     515             :         // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     516             :         //    rms += 1;
     517             :         // which is equivalent to:
     518             :         // if( rms * (rms + 1) < (sumSquares+1) / 4 )
     519             :         //    rms += 1;
     520             :         // And both left and right parts fit on 16 (unsigned) bits
     521             :         const auto sumSquaresPlusOneDiv4 =
     522      516115 :             packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
     523             :         // cmpgt_epi16 operates on signed int16, but here
     524             :         // we have unsigned values, so shift them by -32768 before
     525     2580580 :         const auto mask = cmpgt_epi16(
     526             :             add_epi16(sumSquaresPlusOneDiv4, minus32768),
     527             :             add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
     528             :         // The value of the mask will be -1 when the correction needs to be
     529             :         // applied
     530      516115 :         rms = sub_epi16(rms, mask);
     531             : 
     532             :         // Pack each 16 bit RMS value to 8 bits
     533      516115 :         rms = packus_epi16(rms, rms /* could be anything */);
     534      516115 :         store_lo(&pDstScanline[iDstPixel], rms);
     535      516115 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     536             :     }
     537             : 
     538        5389 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     539        5389 :     return iDstPixel;
     540             : }
     541             : 
     542             : /************************************************************************/
     543             : /*                      AverageByteSSE2OrAVX2()                         */
     544             : /************************************************************************/
     545             : 
     546             : static int
     547      111734 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
     548             :                       const GByte *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     549             :                       GByte *CPL_RESTRICT pDstScanline)
     550             : {
     551             :     // Optimized implementation for average on Byte by processing by group of
     552             :     // 16 output pixels for SSE2, or 32 for AVX2. Each output is
     553             :     // (sum of a 2x2 source block + 2) >> 2; the second source line is read nChunkXSize elements after the first.
     554      111734 :     const auto zero = setzero();
     555      111734 :     const auto two16 = set1_epi16(2);
     556      111734 :     const GByte *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     557             :     // Only full groups are processed; the returned count tells how many outputs were written.
     558      111734 :     constexpr int DEST_ELTS = static_cast<int>(sizeof(zero)) / 2;  // 16-bit lanes per vector; one iteration yields 2 * DEST_ELTS outputs
     559      111734 :     int iDstPixel = 0;
     560     2469500 :     for (; iDstPixel < nDstXWidth - (2 * DEST_ELTS - 1);
     561     2357770 :          iDstPixel += 2 * DEST_ELTS)
     562             :     {
     563             :         decltype(setzero()) average0;  // first DEST_ELTS averages, still as 16-bit values
     564             :         {
     565             :             // Load 2 * DEST_ELTS bytes from each line
     566     2357770 :             const auto firstLine = loadu_int(pSrcScanlineShifted);
     567             :             const auto secondLine =
     568     4715530 :                 loadu_int(pSrcScanlineShifted + nChunkXSize);
     569             :             // Extend those Bytes as UInt16s
     570     2357770 :             const auto firstLineLo = unpacklo_epi8(firstLine, zero);
     571     2357770 :             const auto firstLineHi = unpackhi_epi8(firstLine, zero);
     572     2357770 :             const auto secondLineLo = unpacklo_epi8(secondLine, zero);
     573     2357770 :             const auto secondLineHi = unpackhi_epi8(secondLine, zero);
     574             : 
     575             :             // Vertical addition
     576     2357770 :             const auto sumLo = add_epi16(firstLineLo, secondLineLo);
     577     2357770 :             const auto sumHi = add_epi16(firstLineHi, secondLineHi);
     578             : 
     579             :             // Horizontal addition of adjacent pairs, and recombine low and high
     580             :             // parts
     581     2357770 :             const auto sum = hadd_epi16(sumLo, sumHi);
     582             : 
     583             :             // average = (sum + 2) / 4
     584     2357770 :             average0 = srli_epi16(add_epi16(sum, two16), 2);
     585             : 
     586     2357770 :             pSrcScanlineShifted += 2 * DEST_ELTS;
     587             :         }
     588             : 
     589             :         decltype(setzero()) average1;  // next DEST_ELTS averages, same steps on the following source bytes
     590             :         {
     591             :             // Load 2 * DEST_ELTS bytes from each line
     592     2357770 :             const auto firstLine = loadu_int(pSrcScanlineShifted);
     593             :             const auto secondLine =
     594     4715530 :                 loadu_int(pSrcScanlineShifted + nChunkXSize);
     595             :             // Extend those Bytes as UInt16s
     596     2357770 :             const auto firstLineLo = unpacklo_epi8(firstLine, zero);
     597     2357770 :             const auto firstLineHi = unpackhi_epi8(firstLine, zero);
     598     2357770 :             const auto secondLineLo = unpacklo_epi8(secondLine, zero);
     599     2357770 :             const auto secondLineHi = unpackhi_epi8(secondLine, zero);
     600             : 
     601             :             // Vertical addition
     602     2357770 :             const auto sumLo = add_epi16(firstLineLo, secondLineLo);
     603     2357770 :             const auto sumHi = add_epi16(firstLineHi, secondLineHi);
     604             : 
     605             :             // Horizontal addition of adjacent pairs, and recombine low and high
     606             :             // parts
     607     2357770 :             const auto sum = hadd_epi16(sumLo, sumHi);
     608             : 
     609             :             // average = (sum + 2) / 4
     610     2357770 :             average1 = srli_epi16(add_epi16(sum, two16), 2);
     611             : 
     612     2357770 :             pSrcScanlineShifted += 2 * DEST_ELTS;
     613             :         }
     614             : 
     615             :         // Pack each 16 bit average value to 8 bits
     616     2357770 :         const auto average = packus_epi16(average0, average1);
     617     2357770 :         storeu_int(&pDstScanline[iDstPixel], average);
     618             :     }
     619             : 
     620      111734 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;  // hand back the source position after the consumed bytes
     621      111734 :     return iDstPixel;  // number of output pixels written
     622             : }
     623             : 
     624             : /************************************************************************/
     625             : /*                     QuadraticMeanUInt16SSE2()                        */
     626             : /************************************************************************/
     627             : 
     628             : #ifdef __SSE3__  // SSE3 builds map the helper straight onto _mm_hadd_pd
     629             : #define sse2_hadd_pd _mm_hadd_pd
     630             : #else  // otherwise build the same result from SSE2-only moves
     631         185 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)  // returns (a[0] + a[1], b[0] + b[1])
     632             : {
     633             :     auto aLo_bLo =
     634         740 :         _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));  // (a[0], b[0])
     635             :     auto aHi_bHi =
     636         740 :         _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));  // (a[1], b[1])
     637         185 :     return _mm_add_pd(aLo_bLo, aHi_bHi);  // (aLo + aHi, bLo + bHi)
     638             : }
     639             : #endif
     640             : 
     641         120 : inline __m128d SQUARE_PD(__m128d x)  // lane-wise x * x on 2 doubles
     642             : {
     643         120 :     return _mm_mul_pd(x, x);
     644             : }
     645             : 
     646             : #ifdef __AVX2__
     647             : // Lane-wise x * x on 4 doubles.
     648             : inline __m256d SQUARE_PD(__m256d x)
     649             : {
     650             :     return _mm256_mul_pd(x, x);
     651             : }
     652             : // Swaps the two middle 64-bit elements: (x0, x1, x2, x3) -> (x0, x2, x1, x3).
     653             : inline __m256d FIXUP_LANES(__m256d x)
     654             : {
     655             :     return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));  // applied in this file to the output of _mm256_hadd_pd, which interleaves its operands per 128-bit lane
     656             : }
     657             : // Same reordering for 8 floats, treated as four 64-bit elements.
     658             : inline __m256 FIXUP_LANES(__m256 x)
     659             : {
     660             :     return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
     661             : }
     662             : 
     663             : #endif
     664             : 
     665             : static int
     666          14 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
     667             :                         const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     668             :                         uint16_t *CPL_RESTRICT pDstScanline)
     669             : {
     670             :     // Optimized implementation for RMS on UInt16 by processing by group of 4
     671             :     // output pixels: each is the quadratic mean of a 2x2 source block.
     672          14 :     const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     673             :     // The second source line is read nChunkXSize elements after the first.
     674          14 :     int iDstPixel = 0;
     675          14 :     const auto zero = _mm_setzero_si128();
     676             : 
     677             : #ifdef __AVX2__
     678             :     const auto zeroDot25 = _mm256_set1_pd(0.25);
     679             :     const auto zeroDot5 = _mm256_set1_pd(0.5);
     680             : 
     681             :     // The first four 0's could be anything, as we only take the bottom
     682             :     // 128 bits.
     683             :     const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
     684             : #else
     685          14 :     const auto zeroDot25 = _mm_set1_pd(0.25);
     686          14 :     const auto zeroDot5 = _mm_set1_pd(0.5);
     687             : #endif
     688             :     // Only full groups of DEST_ELTS outputs are processed; the returned count tells how many were written.
     689          14 :     constexpr int DEST_ELTS =
     690             :         static_cast<int>(sizeof(zero) / sizeof(uint16_t)) / 2;  // 8 UInt16 per vector / 2 = 4 outputs per iteration
     691          52 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     692             :     {
     693             :         // Load 8 UInt16 from each line
     694          38 :         const auto firstLine = _mm_loadu_si128(
     695             :             reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     696             :         const auto secondLine =
     697          38 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     698          38 :                 pSrcScanlineShifted + nChunkXSize));
     699             : 
     700             :         // Detect if all of the source values fit in 14 bits.
     701             :         // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
     702             :         // and we can do a much faster implementation.
     703             :         const auto maskTmp =
     704          76 :             _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);  // per value: non-zero iff bit 14 or 15 is set in either line
     705             : #if defined(__i386__) || defined(_M_IX86)  // NOTE(review): presumably because _mm_cvtsi128_si64 is only available on 64-bit targets - confirm
     706             :         uint64_t nMaskFitsIn14Bits = 0;
     707             :         _mm_storel_epi64(
     708             :             reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
     709             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     710             : #else
     711          38 :         const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
     712             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     713             : #endif
     714          38 :         if (nMaskFitsIn14Bits == 0)  // all 16 loaded values are < 2^14: integer fast path
     715             :         {
     716             :             // Multiplication of 16 bit values and horizontal
     717             :             // addition of 32 bit results
     718             :             const auto firstLineHSumSquare =
     719          26 :                 _mm_madd_epi16(firstLine, firstLine);
     720             :             const auto secondLineHSumSquare =
     721          26 :                 _mm_madd_epi16(secondLine, secondLine);
     722             :             // Vertical addition
     723             :             const auto sumSquares =
     724          26 :                 _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
     725             :             // In theory we should take sqrt(sumSquares * 0.25f)
     726             :             // but given the rounding we do, this is equivalent to
     727             :             // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
     728             :             // sumSquares <= 4 * 16383^2
     729          26 :             const auto one32 = _mm_set1_epi32(1);
     730             :             const auto sumSquaresPlusOneDiv4 =
     731          52 :                 _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
     732             :             // Take square root and truncate/floor to int32
     733          78 :             auto rms = _mm_cvttps_epi32(
     734             :                 _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
     735             : 
     736             :             // Round to upper value if it minimizes the
     737             :             // error |rms^2 - sumSquares/4|
     738             :             // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     739             :             //    rms += 1;
     740             :             // which is equivalent to:
     741             :             // if( rms * rms + rms < (sumSquares+1) / 4 )
     742             :             //    rms += 1;
     743             :             auto mask =
     744          78 :                 _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
     745             :                                 _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));  // madd yields rms * rms here since the upper 16 bits of each 32-bit rms lane are zero
     746          26 :             rms = _mm_sub_epi32(rms, mask);  // mask lanes are -1 where the test holds, so this adds 1 there
     747             :             // Pack each 32 bit RMS value to 16 bits
     748          26 :             rms = _mm_packs_epi32(rms, rms /* could be anything */);  // signed saturation cannot trigger: inputs are < 2^14 on this path
     749             :             _mm_storel_epi64(
     750          26 :                 reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
     751          26 :             pSrcScanlineShifted += 2 * DEST_ELTS;
     752          26 :             continue;  // this group is done; skip the double-precision path
     753             :         }
     754             : 
     755             :         // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
     756             :         // to 32 bit would result in 4 multiplications instead of 8, but
     757             :         // mullo/mulhi have a worse throughput than mul_pd.
     758             : 
     759             :         // Extend those UInt16s as UInt32s
     760          12 :         const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
     761          12 :         const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
     762          12 :         const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
     763          12 :         const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
     764             : 
     765             : #ifdef __AVX2__
     766             :         // Multiplication of 32 bit values previously converted to 64 bit double
     767             :         const auto firstLineLoDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineLo));
     768             :         const auto firstLineHiDbl = SQUARE_PD(_mm256_cvtepi32_pd(firstLineHi));
     769             :         const auto secondLineLoDbl =
     770             :             SQUARE_PD(_mm256_cvtepi32_pd(secondLineLo));
     771             :         const auto secondLineHiDbl =
     772             :             SQUARE_PD(_mm256_cvtepi32_pd(secondLineHi));
     773             : 
     774             :         // Vertical addition of squares
     775             :         const auto sumSquaresLo =
     776             :             _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
     777             :         const auto sumSquaresHi =
     778             :             _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
     779             : 
     780             :         // Horizontal addition of squares
     781             :         const auto sumSquares =
     782             :             FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
     783             : 
     784             :         const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
     785             : 
     786             :         // Take square root and truncate/floor to int32
     787             :         auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));  // 4 int32 in a 128-bit vector
     788             :         const auto rmsDouble = _mm256_cvtepi32_pd(rms);
     789             :         const auto right = _mm256_sub_pd(
     790             :             sumDivWeight, _mm256_add_pd(SQUARE_PD(rmsDouble), rmsDouble));
     791             : 
     792             :         auto mask =
     793             :             _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));  // all-ones where 0.5 < sumDivWeight - (rms^2 + rms)
     794             :         // Extract 32-bit from each of the 4 64-bit masks
     795             :         // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
     796             :         // _MM_SHUFFLE(2,0,2,0)));
     797             :         mask = _mm256_permutevar8x32_ps(mask, permutation);
     798             :         const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
     799             : 
     800             :         // Apply the correction
     801             :         rms = _mm_sub_epi32(rms, maskI);
     802             : 
     803             :         // Pack each 32 bit RMS value to 16 bits
     804             :         rms = _mm_packus_epi32(rms, rms /* could be anything */);
     805             : #else
     806             :         // Multiplication of 32 bit values previously converted to 64 bit double
     807          12 :         const auto firstLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineLo));
     808             :         const auto firstLineLoHi =
     809          24 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));  // byte shift brings the upper two UInt32 down for conversion
     810          12 :         const auto firstLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(firstLineHi));
     811             :         const auto firstLineHiHi =
     812          24 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
     813             : 
     814          12 :         const auto secondLineLoLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineLo));
     815             :         const auto secondLineLoHi =
     816          24 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
     817          12 :         const auto secondLineHiLo = SQUARE_PD(_mm_cvtepi32_pd(secondLineHi));
     818             :         const auto secondLineHiHi =
     819          24 :             SQUARE_PD(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
     820             : 
     821             :         // Vertical addition of squares
     822          12 :         const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
     823          12 :         const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
     824          12 :         const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
     825          12 :         const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
     826             : 
     827             :         // Horizontal addition of squares
     828          12 :         const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
     829          12 :         const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
     830             : 
     831          12 :         const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
     832          12 :         const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
     833             :         // Take square root and truncate/floor to int32
     834          24 :         const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
     835          24 :         const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
     836             : 
     837             :         // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
     838             :         // if( 0.5 < sumDivWeight - (rms * rms + rms) )
     839             :         //     rms += 1;
     840          12 :         const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
     841          12 :         const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
     842          24 :         const auto rightLo = _mm_sub_pd(
     843             :             sumDivWeightLo, _mm_add_pd(SQUARE_PD(rmsLoDouble), rmsLoDouble));
     844          36 :         const auto rightHi = _mm_sub_pd(
     845             :             sumDivWeightHi, _mm_add_pd(SQUARE_PD(rmsHiDouble), rmsHiDouble));
     846             : 
     847          24 :         const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
     848          12 :         const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
     849             :         // The value of the mask will be -1 when the correction needs to be
     850             :         // applied
     851          24 :         const auto mask = _mm_castps_si128(_mm_shuffle_ps(
     852             :             maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));  // keeps 32-bit elements 0 and 2 (one per 64-bit mask) of maskLo, then of maskHi
     853             : 
     854          48 :         auto rms = _mm_castps_si128(
     855             :             _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));  // joins the two int32 of rmsLo with the two of rmsHi
     856             :         // Apply the correction
     857          12 :         rms = _mm_sub_epi32(rms, mask);
     858             : 
     859             :         // Pack each 32 bit RMS value to 16 bits
     860          12 :         rms = sse2_packus_epi32(rms, rms /* could be anything */);
     861             : #endif
     862             : 
     863          12 :         _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
     864             :                          rms);
     865          12 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     866             :     }
     867             : 
     868          14 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;  // hand back the source position after the consumed values
     869          14 :     return iDstPixel;  // number of output pixels written
     870             : }
     871             : 
     872             : /************************************************************************/
     873             : /*                         AverageUInt16SSE2()                          */
     874             : /************************************************************************/
     875             : 
     876             : static int
     877          13 : AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
     878             :                   const uint16_t *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     879             :                   uint16_t *CPL_RESTRICT pDstScanline)
     880             : {
     881             :     // Optimized implementation for average on UInt16 by
     882             :     // processing by group of 8 output pixels.
     883             :     // Each output is (sum of a 2x2 source block + 2) >> 2; the second source line is read nChunkXSize elements after the first.
     884          13 :     const auto mask = _mm_set1_epi32(0xFFFF);  // keeps the low UInt16 of each 32-bit lane
     885          13 :     const auto two = _mm_set1_epi32(2);
     886          13 :     const uint16_t *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     887             :     // Only full groups of DEST_ELTS outputs are processed; the returned count tells how many were written.
     888          13 :     int iDstPixel = 0;
     889          13 :     constexpr int DEST_ELTS = static_cast<int>(sizeof(mask) / sizeof(uint16_t));  // 8
     890          25 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     891             :     {
     892             :         __m128i averageLow;  // first 4 averages, as 32-bit values
     893             :         // Load 8 UInt16 from each line
     894             :         {
     895          12 :             const auto firstLine = _mm_loadu_si128(
     896             :                 reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     897             :             const auto secondLine =
     898          12 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     899          12 :                     pSrcScanlineShifted + nChunkXSize));
     900             : 
     901             :             // Horizontal addition and extension to 32 bit
     902          36 :             const auto horizAddFirstLine = _mm_add_epi32(
     903             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));  // low UInt16 + high UInt16 of each 32-bit lane
     904             :             const auto horizAddSecondLine =
     905          36 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     906             :                               _mm_srli_epi32(secondLine, 16));
     907             : 
     908             :             // Vertical addition and average computation
     909             :             // average = (sum + 2) >> 2
     910          24 :             const auto sum = _mm_add_epi32(
     911             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     912          12 :             averageLow = _mm_srli_epi32(sum, 2);
     913             :         }
     914             :         // Load the next 8 UInt16 from each line
     915             :         __m128i averageHigh;  // last 4 averages, as 32-bit values
     916             :         {
     917             :             const auto firstLine =
     918          12 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     919          12 :                     pSrcScanlineShifted + DEST_ELTS));
     920             :             const auto secondLine =
     921          12 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     922          12 :                     pSrcScanlineShifted + DEST_ELTS + nChunkXSize));
     923             : 
     924             :             // Horizontal addition and extension to 32 bit
     925          36 :             const auto horizAddFirstLine = _mm_add_epi32(
     926             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     927             :             const auto horizAddSecondLine =
     928          36 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     929             :                               _mm_srli_epi32(secondLine, 16));
     930             : 
     931             :             // Vertical addition and average computation
     932             :             // average = (sum + 2) >> 2
     933          24 :             const auto sum = _mm_add_epi32(
     934             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     935          12 :             averageHigh = _mm_srli_epi32(sum, 2);
     936             :         }
     937             : 
     938             :         // Pack each 32 bit average value to 16 bits
     939          12 :         auto average = sse2_packus_epi32(averageLow, averageHigh);
     940          12 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
     941             :                          average);
     942          12 :         pSrcScanlineShifted += 2 * DEST_ELTS;  // 16 source values consumed per line
     943             :     }
     944             : 
     945          13 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;  // hand back the source position after the consumed values
     946          13 :     return iDstPixel;  // number of output pixels written
     947             : }
     948             : 
     949             : /************************************************************************/
     950             : /*                      QuadraticMeanFloatSSE2()                        */
     951             : /************************************************************************/
     952             : 
     953             : #if !defined(ARM_V7)
     954             : 
     955             : #ifdef __SSE3__  // SSE3 builds map the helper straight onto _mm_hadd_ps
     956             : #define sse2_hadd_ps _mm_hadd_ps
     957             : #else  // otherwise build the same result from SSE2-only shuffles
     958          82 : inline __m128 sse2_hadd_ps(__m128 a, __m128 b)  // returns (a0 + a1, a2 + a3, b0 + b1, b2 + b3)
     959             : {
     960          82 :     auto aEven_bEven = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));  // (a0, a2, b0, b2)
     961          82 :     auto aOdd_bOdd = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));  // (a1, a3, b1, b3)
     962          82 :     return _mm_add_ps(aEven_bEven, aOdd_bOdd);  // (aEven + aOdd, bEven + bOdd)
     963             : }
     964             : #endif
     965             : 
     966             : #ifdef __AVX2__  // unprefixed *_ps names bound to the 256-bit (8 floats) intrinsics
     967             : #define set1_ps _mm256_set1_ps
     968             : #define loadu_ps _mm256_loadu_ps
     969             : #define andnot_ps _mm256_andnot_ps
     970             : #define and_ps _mm256_and_ps
     971             : #define max_ps _mm256_max_ps
     972             : #define shuffle_ps _mm256_shuffle_ps
     973             : #define div_ps _mm256_div_ps
     974             : #define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)
     975             : #define mul_ps _mm256_mul_ps
     976             : #define add_ps _mm256_add_ps
     977             : #define hadd_ps _mm256_hadd_ps
     978             : #define sqrt_ps _mm256_sqrt_ps
     979             : #define or_ps _mm256_or_ps
     980             : #define unpacklo_ps _mm256_unpacklo_ps
     981             : #define unpackhi_ps _mm256_unpackhi_ps
     982             : #define storeu_ps _mm256_storeu_ps
     983             : #define blendv_ps _mm256_blendv_ps
     984             : // Lane-wise x * x on 8 floats.
     985             : inline __m256 SQUARE_PS(__m256 x)
     986             : {
     987             :     return _mm256_mul_ps(x, x);
     988             : }
     989             : 
     990             : #else  // same names bound to the 128-bit (4 floats) intrinsics
     991             : 
     992             : #define set1_ps _mm_set1_ps
     993             : #define loadu_ps _mm_loadu_ps
     994             : #define andnot_ps _mm_andnot_ps
     995             : #define and_ps _mm_and_ps
     996             : #define max_ps _mm_max_ps
     997             : #define shuffle_ps _mm_shuffle_ps
     998             : #define div_ps _mm_div_ps
     999             : #define cmpeq_ps _mm_cmpeq_ps
    1000             : #define mul_ps _mm_mul_ps
    1001             : #define add_ps _mm_add_ps
    1002             : #define hadd_ps sse2_hadd_ps
    1003             : #define sqrt_ps _mm_sqrt_ps
    1004             : #define or_ps _mm_or_ps
    1005             : #define unpacklo_ps _mm_unpacklo_ps
    1006             : #define unpackhi_ps _mm_unpackhi_ps
    1007             : #define storeu_ps _mm_storeu_ps
    1008             : // Per-lane select between a and b driven by mask: b where the mask is set, a elsewhere.
    1009         132 : inline __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
    1010             : {
    1011             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    1012             :     return _mm_blendv_ps(a, b, mask);
    1013             : #else
    1014         396 :     return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));  // bitwise select; agrees with _mm_blendv_ps (which tests only each lane's sign bit) when mask lanes are all-ones or all-zeros
    1015             : #endif
    1016             : }
    1017             : // Lane-wise x * x on 4 floats.
    1018         528 : inline __m128 SQUARE_PS(__m128 x)
    1019             : {
    1020         528 :     return _mm_mul_ps(x, x);
    1021             : }
    1022             : // Identity on 128-bit vectors. NOTE(review): presumably exists so width-neutral code can call FIXUP_LANES in both builds - confirm against the callers.
    1023         132 : inline __m128 FIXUP_LANES(__m128 x)
    1024             : {
    1025         132 :     return x;
    1026             : }
    1027             : 
    1028             : #endif
    1029             : 
    1030             : static int
    1031             : #if defined(__GNUC__)
    1032             :     __attribute__((noinline))
    1033             : #endif
    1034          66 :     QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
    1035             :                            const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1036             :                            float *CPL_RESTRICT pDstScanline)
    1037             : {
    1038             :     // Vectorized RMS of 2x2 Float32 blocks from two rows nChunkXSize elements apart.
    1039             :     // Returns the number of output pixels written and advances the in/out source pointer.
    1040          66 :     const float *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
    1041             : 
    1042          66 :     int iDstPixel = 0;
    1043          66 :     const auto minus_zero = set1_ps(-0.0f);
    1044          66 :     const auto zeroDot25 = set1_ps(0.25f);
    1045          66 :     const auto one = set1_ps(1.0f);
    1046          66 :     const auto infv = set1_ps(std::numeric_limits<float>::infinity());
    1047          66 :     constexpr int DEST_ELTS = static_cast<int>(sizeof(one) / sizeof(float));
    1048             :     // Only full groups of DEST_ELTS output pixels are handled; the caller resumes at the returned index.
    1049         198 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
    1050             :     {
    1051             :         // Load 2 * DEST_ELTS consecutive Float32 from each of the two source rows
    1052         132 :         auto firstLineLo = loadu_ps(pSrcScanlineShifted);
    1053         132 :         auto firstLineHi = loadu_ps(pSrcScanlineShifted + DEST_ELTS);
    1054         132 :         auto secondLineLo = loadu_ps(pSrcScanlineShifted + nChunkXSize);
    1055             :         auto secondLineHi =
    1056         264 :             loadu_ps(pSrcScanlineShifted + DEST_ELTS + nChunkXSize);
    1057             : 
    1058             :         // Take the absolute value by masking out the sign bit held in minus_zero
    1059         132 :         firstLineLo = andnot_ps(minus_zero, firstLineLo);
    1060         132 :         firstLineHi = andnot_ps(minus_zero, firstLineHi);
    1061         132 :         secondLineLo = andnot_ps(minus_zero, secondLineLo);
    1062         132 :         secondLineHi = andnot_ps(minus_zero, secondLineHi);
    1063             :         // De-interleave each row into its even-column and odd-column samples
    1064             :         auto firstLineEven =
    1065         132 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1066             :         auto firstLineOdd =
    1067         132 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1068             :         auto secondLineEven =
    1069         132 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1070             :         auto secondLineOdd =
    1071         132 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1072             : 
    1073             :         // Per output pixel, maximum magnitude over all 4 samples (even/odd of both rows)
    1074         396 :         const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
    1075             :                                  max_ps(secondLineEven, secondLineOdd));
    1076             : 
    1077             :         // Normalize each value by the maximum of its group of 4 samples.
    1078             :         // This step is important to avoid that the square evaluates to infinity
    1079             :         // for sufficiently big input.
    1080         132 :         auto invMax = div_ps(one, maxV);
    1081             :         // Deal with 0 being the maximum to correct division by zero
    1082             :         // note: comparing to -0 leads to identical results as to comparing with
    1083             :         // 0
    1084         264 :         invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
    1085             : 
    1086         132 :         firstLineEven = mul_ps(firstLineEven, invMax);
    1087         132 :         firstLineOdd = mul_ps(firstLineOdd, invMax);
    1088         132 :         secondLineEven = mul_ps(secondLineEven, invMax);
    1089         132 :         secondLineOdd = mul_ps(secondLineOdd, invMax);
    1090             : 
    1091             :         // Compute squares
    1092         132 :         firstLineEven = SQUARE_PS(firstLineEven);
    1093         132 :         firstLineOdd = SQUARE_PS(firstLineOdd);
    1094         132 :         secondLineEven = SQUARE_PS(secondLineEven);
    1095         132 :         secondLineOdd = SQUARE_PS(secondLineOdd);
    1096             : 
    1097         396 :         const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
    1098             :                                        add_ps(secondLineEven, secondLineOdd));
    1099             :         // rms = max * sqrt(mean of the normalized squares)
    1100         396 :         auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
    1101             : 
    1102             :         // Deal with infinity being the maximum
    1103         132 :         const auto maskIsInf = cmpeq_ps(maxV, infv);
    1104         132 :         rms = blendv_ps(rms, infv, maskIsInf);
    1105             :         // FIXUP_LANES is defined outside this excerpt; presumably it restores pixel order
    1106         132 :         rms = FIXUP_LANES(rms);
    1107             : 
    1108         132 :         storeu_ps(&pDstScanline[iDstPixel], rms);
    1109         132 :         pSrcScanlineShifted += DEST_ELTS * 2;
    1110             :     }
    1111             : 
    1112          66 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    1113          66 :     return iDstPixel;
    1114             : }
    1115             : 
    1116             : /************************************************************************/
    1117             : /*                        AverageFloatSSE2()                            */
    1118             : /************************************************************************/
    1119             : 
    1120          46 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
    1121             :                             const float *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1122             :                             float *CPL_RESTRICT pDstScanline)
    1123             : {
    1124             :     // SSE2 kernel averaging 2x2 blocks of Float32 samples, DEST_ELTS output pixels
    1125             :     // per iteration. Returns the pixel count written; the in/out pointer is advanced.
    1126          46 :     const float *CPL_RESTRICT pSrcCursor = pSrcScanlineShiftedInOut;
    1127             : 
    1128          46 :     int iDstPixel = 0;
    1129          46 :     const auto quarter = _mm_set1_ps(0.25f);
    1130          46 :     constexpr int DEST_ELTS =
    1131             :         static_cast<int>(sizeof(quarter) / sizeof(float));
    1132             :     // Stop as soon as fewer than DEST_ELTS output pixels remain
    1133         128 :     for (; nDstXWidth - iDstPixel >= DEST_ELTS; iDstPixel += DEST_ELTS)
    1134             :     {
    1135             :         // Fetch 2 * DEST_ELTS Float32 per source row, scaled by 1/4 before any addition
    1136             :         const auto row0Lo =
    1137          82 :             _mm_mul_ps(_mm_loadu_ps(pSrcCursor), quarter);
    1138         164 :         const auto row0Hi = _mm_mul_ps(
    1139             :             _mm_loadu_ps(pSrcCursor + DEST_ELTS), quarter);
    1140          82 :         const auto row1Lo = _mm_mul_ps(
    1141          82 :             _mm_loadu_ps(pSrcCursor + nChunkXSize), quarter);
    1142         164 :         const auto row1Hi = _mm_mul_ps(
    1143          82 :             _mm_loadu_ps(pSrcCursor + DEST_ELTS + nChunkXSize),
    1144             :             quarter);
    1145             : 
    1146             :         // Add the two rows column by column
    1147          82 :         const auto columnSumLo = _mm_add_ps(row0Lo, row1Lo);
    1148          82 :         const auto columnSumHi = _mm_add_ps(row0Hi, row1Hi);
    1149             : 
    1150             :         // Combine neighbouring columns (sse2_hadd_ps is defined outside this excerpt)
    1151          82 :         const auto blockMean = sse2_hadd_ps(columnSumLo, columnSumHi);
    1152             : 
    1153          82 :         _mm_storeu_ps(pDstScanline + iDstPixel, blockMean);
    1154          82 :         pSrcCursor += 2 * DEST_ELTS;
    1155             :     }
    1156             : 
    1157          46 :     pSrcScanlineShiftedInOut = pSrcCursor;
    1158          46 :     return iDstPixel;
    1159             : }
    1160             : 
    1161             : /************************************************************************/
    1162             : /*                        AverageDoubleSSE2()                           */
    1163             : /************************************************************************/
    1164             : 
    1165             : static int
    1166          50 : AverageDoubleSSE2(int nDstXWidth, int nChunkXSize,
    1167             :                   const double *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1168             :                   double *CPL_RESTRICT pDstScanline)
    1169             : {
    1170             :     // SSE2 kernel averaging 2x2 blocks of Float64 samples, DEST_ELTS output pixels
    1171             :     // per iteration. Returns the pixel count written; the in/out pointer is advanced.
    1172          50 :     const double *CPL_RESTRICT pSrcCursor = pSrcScanlineShiftedInOut;
    1173             : 
    1174          50 :     int iDstPixel = 0;
    1175          50 :     const auto quarter = _mm_set1_pd(0.25);
    1176          50 :     constexpr int DEST_ELTS =
    1177             :         static_cast<int>(sizeof(quarter) / sizeof(double));
    1178             :     // Stop as soon as fewer than DEST_ELTS output pixels remain
    1179         211 :     for (; nDstXWidth - iDstPixel >= DEST_ELTS; iDstPixel += DEST_ELTS)
    1180             :     {
    1181             :         // Fetch 2 * DEST_ELTS Float64 per source row, scaled by 1/4 before any addition
    1182         161 :         const auto row0First = _mm_mul_pd(
    1183             :             _mm_loadu_pd(pSrcCursor), quarter);
    1184         322 :         const auto row0Second = _mm_mul_pd(
    1185             :             _mm_loadu_pd(pSrcCursor + DEST_ELTS), quarter);
    1186         161 :         const auto row1First = _mm_mul_pd(
    1187         161 :             _mm_loadu_pd(pSrcCursor + nChunkXSize),
    1188             :             quarter);
    1189         322 :         const auto row1Second = _mm_mul_pd(
    1190         161 :             _mm_loadu_pd(pSrcCursor + DEST_ELTS + nChunkXSize),
    1191             :             quarter);
    1192             : 
    1193             :         // Add the two rows column by column
    1194         161 :         const auto columnSumFirst = _mm_add_pd(row0First, row1First);
    1195         161 :         const auto columnSumSecond = _mm_add_pd(row0Second, row1Second);
    1196             : 
    1197             :         // Combine neighbouring columns (sse2_hadd_pd is defined outside this excerpt)
    1198         161 :         const auto blockMean = sse2_hadd_pd(columnSumFirst, columnSumSecond);
    1199             : 
    1200         161 :         _mm_storeu_pd(pDstScanline + iDstPixel, blockMean);
    1201         161 :         pSrcCursor += 2 * DEST_ELTS;
    1202             :     }
    1203             : 
    1204          50 :     pSrcScanlineShiftedInOut = pSrcCursor;
    1205          50 :     return iDstPixel;
    1206             : }
    1207             : 
    1208             : #endif
    1209             : 
    1210             : #endif
    1211             : 
    1212             : /************************************************************************/
    1213             : /*                    GDALResampleChunk_AverageOrRMS()                  */
    1214             : /************************************************************************/
    1215             : 
    1216             : template <class T, class Tsum, GDALDataType eWrkDataType, bool bQuadraticMean>
    1217             : static CPLErr
    1218        2388 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
    1219             :                                  const T *pChunk, void **ppDstBuffer)
    1220             : {
    1221        2388 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    1222        2388 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    1223        2388 :     const double dfSrcXDelta = args.dfSrcXDelta;
    1224        2388 :     const double dfSrcYDelta = args.dfSrcYDelta;
    1225        2388 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    1226        2388 :     const int nChunkXOff = args.nChunkXOff;
    1227        2388 :     const int nChunkYOff = args.nChunkYOff;
    1228        2388 :     const int nChunkXSize = args.nChunkXSize;
    1229        2388 :     const int nChunkYSize = args.nChunkYSize;
    1230        2388 :     const int nDstXOff = args.nDstXOff;
    1231        2388 :     const int nDstXOff2 = args.nDstXOff2;
    1232        2388 :     const int nDstYOff = args.nDstYOff;
    1233        2388 :     const int nDstYOff2 = args.nDstYOff2;
    1234        2388 :     const char *pszResampling = args.pszResampling;
    1235        2388 :     bool bHasNoData = args.bHasNoData;
    1236        2388 :     const double dfNoDataValue = args.dfNoDataValue;
    1237        2388 :     const GDALColorTable *const poColorTable =
    1238             :         !bQuadraticMean &&
    1239             :                 // AVERAGE_BIT2GRAYSCALE
    1240        2311 :                 CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"))
    1241             :             ? nullptr
    1242             :             : args.poColorTable;
    1243        2388 :     const bool bPropagateNoData = args.bPropagateNoData;
    1244             : 
    1245        2388 :     T tNoDataValue = (!bHasNoData) ? 0 : static_cast<T>(dfNoDataValue);
    1246        2388 :     const T tReplacementVal =
    1247         174 :         bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
    1248          56 :                          args.eOvrDataType, dfNoDataValue))
    1249             :                    : 0;
    1250             : 
    1251        2388 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    1252        2388 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    1253        2388 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
    1254             : 
    1255             :     /* -------------------------------------------------------------------- */
    1256             :     /*      Allocate buffers.                                               */
    1257             :     /* -------------------------------------------------------------------- */
    1258        2388 :     *ppDstBuffer = static_cast<T *>(
    1259        2388 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
    1260             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
    1261        2388 :     if (*ppDstBuffer == nullptr)
    1262             :     {
    1263           0 :         return CE_Failure;
    1264             :     }
    1265        2388 :     T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
    1266             : 
    1267             :     struct PrecomputedXValue
    1268             :     {
    1269             :         int nLeftXOffShifted;
    1270             :         int nRightXOffShifted;
    1271             :         double dfLeftWeight;
    1272             :         double dfRightWeight;
    1273             :         double dfTotalWeightFullLine;
    1274             :     };
    1275             : 
    1276             :     PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
    1277        2388 :         VSI_MALLOC2_VERBOSE(nDstXWidth, sizeof(PrecomputedXValue)));
    1278             : 
    1279        2388 :     if (pasSrcX == nullptr)
    1280             :     {
    1281           0 :         return CE_Failure;
    1282             :     }
    1283             : 
    1284        2388 :     std::vector<GDALColorEntry> colorEntries;
    1285             : 
    1286        2388 :     if (poColorTable)
    1287             :     {
    1288           5 :         int nTransparentIdx = -1;
    1289           5 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    1290             : 
    1291             :         // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    1292             :         // it as nodata value
    1293           6 :         if (bHasNoData && dfNoDataValue >= 0.0 &&
    1294           1 :             tNoDataValue < colorEntries.size())
    1295           1 :             colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
    1296             : 
    1297             :         // Or if we have no explicit nodata, but a color table entry that is
    1298             :         // transparent, consider it as the nodata value
    1299           4 :         else if (!bHasNoData && nTransparentIdx >= 0)
    1300             :         {
    1301           0 :             bHasNoData = true;
    1302           0 :             tNoDataValue = static_cast<T>(nTransparentIdx);
    1303             :         }
    1304             :     }
    1305             : 
    1306             :     /* ==================================================================== */
    1307             :     /*      Precompute inner loop constants.                                */
    1308             :     /* ==================================================================== */
    1309        2388 :     bool bSrcXSpacingIsTwo = true;
    1310        2388 :     int nLastSrcXOff2 = -1;
    1311      856888 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    1312             :     {
    1313      854500 :         const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    1314             :         // Apply some epsilon to avoid numerical precision issues
    1315      854500 :         int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    1316      854500 :         const double dfSrcXOff2 =
    1317      854500 :             dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    1318      854500 :         int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    1319             : 
    1320      854500 :         if (nSrcXOff < nChunkXOff)
    1321           0 :             nSrcXOff = nChunkXOff;
    1322      854500 :         if (nSrcXOff2 == nSrcXOff)
    1323           0 :             nSrcXOff2++;
    1324      854500 :         if (nSrcXOff2 > nChunkRightXOff)
    1325           1 :             nSrcXOff2 = nChunkRightXOff;
    1326             : 
    1327      854500 :         pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
    1328      854500 :         pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
    1329      854500 :             nSrcXOff2 - nChunkXOff;
    1330          21 :         pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
    1331      854500 :             (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
    1332      854500 :         pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
    1333      854500 :             1 - (nSrcXOff2 - dfSrcXOff2);
    1334      854500 :         pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
    1335      854500 :             pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
    1336      854500 :         if (nSrcXOff + 1 < nSrcXOff2)
    1337             :         {
    1338      854479 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1339      854479 :                 nSrcXOff2 - nSrcXOff - 2;
    1340      854479 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1341      854479 :                 pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
    1342             :         }
    1343             : 
    1344      854500 :         if (nSrcXOff2 - nSrcXOff != 2 ||
    1345      733021 :             (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
    1346             :         {
    1347      120627 :             bSrcXSpacingIsTwo = false;
    1348             :         }
    1349      854500 :         nLastSrcXOff2 = nSrcXOff2;
    1350             :     }
    1351             : 
    1352             :     /* ==================================================================== */
    1353             :     /*      Loop over destination scanlines.                                */
    1354             :     /* ==================================================================== */
    1355      722538 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    1356             :     {
    1357      720150 :         const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    1358      720150 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    1359      720150 :         if (nSrcYOff < nChunkYOff)
    1360           0 :             nSrcYOff = nChunkYOff;
    1361             : 
    1362      720150 :         const double dfSrcYOff2 =
    1363      720150 :             dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    1364      720150 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    1365      720150 :         if (nSrcYOff2 == nSrcYOff)
    1366           0 :             ++nSrcYOff2;
    1367      720150 :         if (nSrcYOff2 > nChunkBottomYOff)
    1368           3 :             nSrcYOff2 = nChunkBottomYOff;
    1369             : 
    1370      720150 :         T *const pDstScanline =
    1371      720150 :             pDstBuffer + static_cast<size_t>(iDstLine - nDstYOff) * nDstXWidth;
    1372             : 
    1373             :         /* --------------------------------------------------------------------
    1374             :          */
    1375             :         /*      Loop over destination pixels */
    1376             :         /* --------------------------------------------------------------------
    1377             :          */
    1378      720150 :         if (poColorTable == nullptr)
    1379             :         {
    1380      720035 :             if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
    1381             :                 pabyChunkNodataMask == nullptr)
    1382             :             {
    1383             :                 if constexpr (eWrkDataType == GDT_Byte ||
    1384             :                               eWrkDataType == GDT_UInt16)
    1385             :                 {
    1386             :                     // Optimized case : no nodata, overview by a factor of 2 and
    1387             :                     // regular x and y src spacing.
    1388      117150 :                     const T *pSrcScanlineShifted =
    1389      117150 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1390      117150 :                         static_cast<size_t>(nSrcYOff - nChunkYOff) *
    1391      117150 :                             nChunkXSize;
    1392      117150 :                     int iDstPixel = 0;
    1393             : #ifdef USE_SSE2
    1394             :                     if constexpr (eWrkDataType == GDT_Byte)
    1395             :                     {
    1396             :                         if constexpr (bQuadraticMean)
    1397             :                         {
    1398        5389 :                             iDstPixel = QuadraticMeanByteSSE2OrAVX2(
    1399             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1400             :                                 pDstScanline);
    1401             :                         }
    1402             :                         else
    1403             :                         {
    1404      111734 :                             iDstPixel = AverageByteSSE2OrAVX2(
    1405             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1406             :                                 pDstScanline);
    1407             :                         }
    1408             :                     }
    1409             :                     else
    1410             :                     {
    1411             :                         static_assert(eWrkDataType == GDT_UInt16);
    1412             :                         if constexpr (bQuadraticMean)
    1413             :                         {
    1414          14 :                             iDstPixel = QuadraticMeanUInt16SSE2(
    1415             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1416             :                                 pDstScanline);
    1417             :                         }
    1418             :                         else
    1419             :                         {
    1420          13 :                             iDstPixel = AverageUInt16SSE2(
    1421             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1422             :                                 pDstScanline);
    1423             :                         }
    1424             :                     }
    1425             : #endif
    1426      291609 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1427             :                     {
    1428      174459 :                         Tsum nTotal = 0;
    1429             :                         T nVal;
    1430             :                         if constexpr (bQuadraticMean)
    1431          52 :                             nTotal =
    1432          52 :                                 SQUARE<Tsum>(pSrcScanlineShifted[0]) +
    1433          52 :                                 SQUARE<Tsum>(pSrcScanlineShifted[1]) +
    1434          52 :                                 SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
    1435          52 :                                 SQUARE<Tsum>(
    1436          52 :                                     pSrcScanlineShifted[1 + nChunkXSize]);
    1437             :                         else
    1438      174407 :                             nTotal = pSrcScanlineShifted[0] +
    1439      174407 :                                      pSrcScanlineShifted[1] +
    1440      174407 :                                      pSrcScanlineShifted[nChunkXSize] +
    1441      174407 :                                      pSrcScanlineShifted[1 + nChunkXSize];
    1442             : 
    1443      174459 :                         constexpr int nTotalWeight = 4;
    1444             :                         if constexpr (bQuadraticMean)
    1445          52 :                             nVal = ComputeIntegerRMS_4values<T>(nTotal);
    1446             :                         else
    1447      174407 :                             nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
    1448             :                                                   nTotalWeight);
    1449             : 
    1450             :                         // No need to compare nVal against tNoDataValue as we
    1451             :                         // are in a case where pabyChunkNodataMask == nullptr
    1452             :                         // implies the absence of nodata value.
    1453      174459 :                         pDstScanline[iDstPixel] = nVal;
    1454      174459 :                         pSrcScanlineShifted += 2;
    1455             :                     }
    1456             :                 }
    1457             :                 else
    1458             :                 {
    1459             :                     static_assert(eWrkDataType == GDT_Float32 ||
    1460             :                                   eWrkDataType == GDT_Float64);
    1461         198 :                     const T *pSrcScanlineShifted =
    1462         198 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1463         198 :                         static_cast<size_t>(nSrcYOff - nChunkYOff) *
    1464         198 :                             nChunkXSize;
    1465         198 :                     int iDstPixel = 0;
    1466             : #if defined(USE_SSE2) && !defined(ARM_V7)
    1467             :                     if constexpr (eWrkDataType == GDT_Float32)
    1468             :                     {
    1469             :                         static_assert(std::is_same_v<T, float>);
    1470             :                         if constexpr (bQuadraticMean)
    1471             :                         {
    1472          66 :                             iDstPixel = QuadraticMeanFloatSSE2(
    1473             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1474             :                                 pDstScanline);
    1475             :                         }
    1476             :                         else
    1477             :                         {
    1478          46 :                             iDstPixel = AverageFloatSSE2(
    1479             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1480             :                                 pDstScanline);
    1481             :                         }
    1482             :                     }
    1483             :                     else
    1484             :                     {
    1485             :                         if constexpr (!bQuadraticMean)
    1486             :                         {
    1487          50 :                             iDstPixel = AverageDoubleSSE2(
    1488             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1489             :                                 pDstScanline);
    1490             :                         }
    1491             :                     }
    1492             : #endif
    1493             : 
    1494         714 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1495             :                     {
    1496             :                         T nVal;
    1497             : 
    1498             :                         if constexpr (bQuadraticMean)
    1499             :                         {
    1500             :                             // Avoid issues with large values by renormalizing
    1501          96 :                             const auto max = std::max(
    1502         420 :                                 {std::fabs(pSrcScanlineShifted[0]),
    1503         420 :                                  std::fabs(pSrcScanlineShifted[1]),
    1504         420 :                                  std::fabs(pSrcScanlineShifted[nChunkXSize]),
    1505         420 :                                  std::fabs(
    1506         420 :                                      pSrcScanlineShifted[1 + nChunkXSize])});
    1507         420 :                             if (max == 0)
    1508             :                             {
    1509           8 :                                 nVal = 0;
    1510             :                             }
    1511         412 :                             else if (std::isinf(max))
    1512             :                             {
    1513             :                                 // If there is at least one infinity value,
    1514             :                                 // then just summing, and taking the abs
    1515             :                                 // value will give the expected result:
    1516             :                                 // * +inf if all values are +inf
    1517             :                                 // * +inf if all values are -inf
    1518             :                                 // * NaN otherwise
    1519          82 :                                 nVal = std::fabs(
    1520          82 :                                     pSrcScanlineShifted[0] +
    1521          82 :                                     pSrcScanlineShifted[1] +
    1522          82 :                                     pSrcScanlineShifted[nChunkXSize] +
    1523          82 :                                     pSrcScanlineShifted[1 + nChunkXSize]);
    1524             :                             }
    1525             :                             else
    1526             :                             {
    1527         330 :                                 const auto inv_max = static_cast<T>(1.0) / max;
    1528         330 :                                 nVal =
    1529             :                                     max *
    1530         330 :                                     std::sqrt(
    1531             :                                         static_cast<T>(0.25) *
    1532         330 :                                         (SQUARE(pSrcScanlineShifted[0] *
    1533         330 :                                                 inv_max) +
    1534         330 :                                          SQUARE(pSrcScanlineShifted[1] *
    1535         330 :                                                 inv_max) +
    1536         330 :                                          SQUARE(
    1537         330 :                                              pSrcScanlineShifted[nChunkXSize] *
    1538         330 :                                              inv_max) +
    1539         330 :                                          SQUARE(
    1540         330 :                                              pSrcScanlineShifted[1 +
    1541             :                                                                  nChunkXSize] *
    1542             :                                              inv_max)));
    1543             :                             }
    1544             :                         }
    1545             :                         else
    1546             :                         {
    1547          96 :                             constexpr auto weight = static_cast<T>(0.25);
    1548             :                             // Multiply each value by weight to avoid
    1549             :                             // potential overflow
    1550          96 :                             nVal =
    1551          96 :                                 (weight * pSrcScanlineShifted[0] +
    1552          96 :                                  weight * pSrcScanlineShifted[1] +
    1553          96 :                                  weight * pSrcScanlineShifted[nChunkXSize] +
    1554          96 :                                  weight * pSrcScanlineShifted[1 + nChunkXSize]);
    1555             :                         }
    1556             : 
    1557             :                         // No need to compare nVal against tNoDataValue as we
    1558             :                         // are in a case where pabyChunkNodataMask == nullptr
    1559             :                         // implies the absence of nodata value.
    1560         516 :                         pDstScanline[iDstPixel] = nVal;
    1561         516 :                         pSrcScanlineShifted += 2;
    1562             :                     }
    1563      117348 :                 }
    1564             :             }
    1565             :             else
    1566             :             {
    1567          18 :                 const double dfBottomWeight =
    1568      602687 :                     (nSrcYOff + 1 == nSrcYOff2) ? 1.0
    1569      602669 :                                                 : 1.0 - (dfSrcYOff - nSrcYOff);
    1570      602687 :                 const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
    1571      602687 :                 nSrcYOff -= nChunkYOff;
    1572      602687 :                 nSrcYOff2 -= nChunkYOff;
    1573             : 
    1574      602687 :                 double dfTotalWeightFullColumn = dfBottomWeight;
    1575      602687 :                 if (nSrcYOff + 1 < nSrcYOff2)
    1576             :                 {
    1577      602669 :                     dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
    1578      602669 :                     dfTotalWeightFullColumn += dfTopWeight;
    1579             :                 }
    1580             : 
    1581    18752173 :                 for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1582             :                 {
    1583    18149533 :                     const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1584    18149533 :                     const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1585             : 
    1586    18149533 :                     double dfTotal = 0;
    1587    18149533 :                     double dfTotalWeight = 0;
    1588    18149533 :                     [[maybe_unused]] double dfMulFactor = 1.0;
    1589    18149533 :                     [[maybe_unused]] double dfInvMulFactor = 1.0;
    1590    18149533 :                     constexpr bool bUseMulFactor =
    1591             :                         (eWrkDataType == GDT_Float32 ||
    1592             :                          eWrkDataType == GDT_Float64);
    1593    18149533 :                     if (pabyChunkNodataMask == nullptr)
    1594             :                     {
    1595             :                         if constexpr (bUseMulFactor)
    1596             :                         {
    1597             :                             if constexpr (bQuadraticMean)
    1598             :                             {
    1599          80 :                                 T mulFactor = 0;
    1600          80 :                                 auto pChunkShifted =
    1601          80 :                                     pChunk +
    1602          80 :                                     static_cast<size_t>(nSrcYOff) * nChunkXSize;
    1603             : 
    1604         240 :                                 for (int iY = nSrcYOff; iY < nSrcYOff2;
    1605         160 :                                      ++iY, pChunkShifted += nChunkXSize)
    1606             :                                 {
    1607         480 :                                     for (int iX = nSrcXOff; iX < nSrcXOff2;
    1608             :                                          ++iX)
    1609         640 :                                         mulFactor = std::max(
    1610             :                                             mulFactor,
    1611         320 :                                             std::fabs(pChunkShifted[iX]));
    1612             :                                 }
    1613          80 :                                 dfMulFactor = double(mulFactor);
    1614         142 :                                 dfInvMulFactor =
    1615          62 :                                     dfMulFactor > 0 &&
    1616          62 :                                             std::isfinite(dfMulFactor)
    1617             :                                         ? 1.0 / dfMulFactor
    1618             :                                         : 1.0;
    1619             :                             }
    1620             :                             else
    1621             :                             {
    1622         139 :                                 dfMulFactor = (nSrcYOff2 - nSrcYOff) *
    1623         139 :                                               (nSrcXOff2 - nSrcXOff);
    1624         139 :                                 dfInvMulFactor = 1.0 / dfMulFactor;
    1625             :                             }
    1626             :                         }
    1627             : 
    1628     1746545 :                         auto pChunkShifted =
    1629         227 :                             pChunk +
    1630     1746545 :                             static_cast<size_t>(nSrcYOff) * nChunkXSize;
    1631     1746545 :                         int nCounterY = nSrcYOff2 - nSrcYOff - 1;
    1632     1746545 :                         double dfWeightY = dfBottomWeight;
    1633     3493539 :                         while (true)
    1634             :                         {
    1635             :                             double dfTotalLine;
    1636             :                             if constexpr (bQuadraticMean)
    1637             :                             {
    1638             :                                 // Left pixel
    1639             :                                 {
    1640         216 :                                     const T val = pChunkShifted[nSrcXOff];
    1641         216 :                                     dfTotalLine =
    1642         216 :                                         SQUARE(double(val) * dfInvMulFactor) *
    1643         216 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1644             :                                 }
    1645             : 
    1646         216 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1647             :                                 {
    1648             :                                     // Middle pixels
    1649         216 :                                     for (int iX = nSrcXOff + 1;
    1650         536 :                                          iX < nSrcXOff2 - 1; ++iX)
    1651             :                                     {
    1652         320 :                                         const T val = pChunkShifted[iX];
    1653         320 :                                         dfTotalLine += SQUARE(double(val) *
    1654             :                                                               dfInvMulFactor);
    1655             :                                     }
    1656             : 
    1657             :                                     // Right pixel
    1658             :                                     {
    1659         216 :                                         const T val =
    1660         216 :                                             pChunkShifted[nSrcXOff2 - 1];
    1661         216 :                                         dfTotalLine +=
    1662         216 :                                             SQUARE(double(val) *
    1663         216 :                                                    dfInvMulFactor) *
    1664         216 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1665             :                                     }
    1666             :                                 }
    1667             :                             }
    1668             :                             else
    1669             :                             {
    1670             :                                 // Left pixel
    1671             :                                 {
    1672     5239868 :                                     const T val = pChunkShifted[nSrcXOff];
    1673     5239868 :                                     dfTotalLine =
    1674     5239868 :                                         double(val) * dfInvMulFactor *
    1675     5239868 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1676             :                                 }
    1677             : 
    1678     5239868 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1679             :                                 {
    1680             :                                     // Middle pixels
    1681     4239442 :                                     for (int iX = nSrcXOff + 1;
    1682    64183238 :                                          iX < nSrcXOff2 - 1; ++iX)
    1683             :                                     {
    1684    59943836 :                                         const T val = pChunkShifted[iX];
    1685    59943836 :                                         dfTotalLine +=
    1686    59943836 :                                             double(val) * dfInvMulFactor;
    1687             :                                     }
    1688             : 
    1689             :                                     // Right pixel
    1690             :                                     {
    1691     4239442 :                                         const T val =
    1692     4239442 :                                             pChunkShifted[nSrcXOff2 - 1];
    1693     4239442 :                                         dfTotalLine +=
    1694     4239442 :                                             double(val) * dfInvMulFactor *
    1695     4239442 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1696             :                                     }
    1697             :                                 }
    1698             :                             }
    1699             : 
    1700     5240084 :                             dfTotal += dfTotalLine * dfWeightY;
    1701     5240084 :                             --nCounterY;
    1702     5240084 :                             if (nCounterY < 0)
    1703     1746545 :                                 break;
    1704     3493539 :                             pChunkShifted += nChunkXSize;
    1705     3493539 :                             dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
    1706             :                         }
    1707             : 
    1708     1746545 :                         dfTotalWeight =
    1709     1746545 :                             pasSrcX[iDstPixel].dfTotalWeightFullLine *
    1710             :                             dfTotalWeightFullColumn;
    1711             :                     }
    1712             :                     else
    1713             :                     {
    1714    16402998 :                         size_t nCount = 0;
    1715    71753694 :                         for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1716             :                         {
    1717    55350696 :                             const auto pChunkShifted =
    1718    55350696 :                                 pChunk + static_cast<size_t>(iY) * nChunkXSize;
    1719             : 
    1720    55350696 :                             double dfTotalLine = 0;
    1721    55350696 :                             double dfTotalWeightLine = 0;
    1722             :                             // Left pixel
    1723             :                             {
    1724    55350696 :                                 const int iX = nSrcXOff;
    1725    55350696 :                                 const T val = pChunkShifted[iX];
    1726    55350696 :                                 if (pabyChunkNodataMask
    1727    55350696 :                                         [iX +
    1728    55350696 :                                          static_cast<size_t>(iY) * nChunkXSize])
    1729             :                                 {
    1730    23510843 :                                     nCount++;
    1731    23510843 :                                     const double dfWeightX =
    1732    23510843 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1733    23510843 :                                     dfTotalWeightLine = dfWeightX;
    1734             :                                     if constexpr (bQuadraticMean)
    1735          60 :                                         dfTotalLine =
    1736          60 :                                             SQUARE(double(val)) * dfWeightX;
    1737             :                                     else
    1738    23510783 :                                         dfTotalLine = double(val) * dfWeightX;
    1739             :                                 }
    1740             :                             }
    1741             : 
    1742    55350696 :                             if (nSrcXOff < nSrcXOff2 - 1)
    1743             :                             {
    1744             :                                 // Middle pixels
    1745   152871196 :                                 for (int iX = nSrcXOff + 1; iX < nSrcXOff2 - 1;
    1746             :                                      ++iX)
    1747             :                                 {
    1748    97519600 :                                     const T val = pChunkShifted[iX];
    1749    97519600 :                                     if (pabyChunkNodataMask
    1750    97519600 :                                             [iX + static_cast<size_t>(iY) *
    1751    97519600 :                                                       nChunkXSize])
    1752             :                                     {
    1753    39728100 :                                         nCount++;
    1754    39728100 :                                         dfTotalWeightLine += 1;
    1755             :                                         if constexpr (bQuadraticMean)
    1756           0 :                                             dfTotalLine += SQUARE(double(val));
    1757             :                                         else
    1758    39728100 :                                             dfTotalLine += double(val);
    1759             :                                     }
    1760             :                                 }
    1761             : 
    1762             :                                 // Right pixel
    1763             :                                 {
    1764    55351596 :                                     const int iX = nSrcXOff2 - 1;
    1765    55351596 :                                     const T val = pChunkShifted[iX];
    1766    55351596 :                                     if (pabyChunkNodataMask
    1767    55351596 :                                             [iX + static_cast<size_t>(iY) *
    1768    55351596 :                                                       nChunkXSize])
    1769             :                                     {
    1770    23510111 :                                         nCount++;
    1771    23510111 :                                         const double dfWeightX =
    1772    23510111 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1773    23510111 :                                         dfTotalWeightLine += dfWeightX;
    1774             :                                         if constexpr (bQuadraticMean)
    1775          61 :                                             dfTotalLine +=
    1776          61 :                                                 SQUARE(double(val)) * dfWeightX;
    1777             :                                         else
    1778    23510050 :                                             dfTotalLine +=
    1779    23510050 :                                                 double(val) * dfWeightX;
    1780             :                                     }
    1781             :                                 }
    1782             :                             }
    1783             : 
    1784    94310394 :                             const double dfWeightY =
    1785             :                                 (iY == nSrcYOff)        ? dfBottomWeight
    1786    38959698 :                                 : (iY + 1 == nSrcYOff2) ? dfTopWeight
    1787             :                                                         : 1.0;
    1788    55350696 :                             dfTotal += dfTotalLine * dfWeightY;
    1789    55350696 :                             dfTotalWeight += dfTotalWeightLine * dfWeightY;
    1790             :                         }
    1791             : 
    1792    16402998 :                         if (nCount == 0 ||
    1793           8 :                             (bPropagateNoData &&
    1794             :                              nCount <
    1795           8 :                                  static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    1796           8 :                                      (nSrcXOff2 - nSrcXOff)))
    1797             :                         {
    1798     9607412 :                             pDstScanline[iDstPixel] = tNoDataValue;
    1799     9607412 :                             continue;
    1800             :                         }
    1801             :                     }
    1802             :                     if constexpr (eWrkDataType == GDT_Byte)
    1803             :                     {
    1804             :                         T nVal;
    1805             :                         if constexpr (bQuadraticMean)
    1806          38 :                             nVal = ComputeIntegerRMS<T, int>(dfTotal,
    1807             :                                                              dfTotalWeight);
    1808             :                         else
    1809     8541810 :                             nVal =
    1810     8541810 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1811     8541848 :                         if (bHasNoData && nVal == tNoDataValue)
    1812           0 :                             nVal = tReplacementVal;
    1813     8541848 :                         pDstScanline[iDstPixel] = nVal;
    1814             :                     }
    1815             :                     else if constexpr (eWrkDataType == GDT_UInt16)
    1816             :                     {
    1817             :                         T nVal;
    1818             :                         if constexpr (bQuadraticMean)
    1819           4 :                             nVal = ComputeIntegerRMS<T, uint64_t>(
    1820             :                                 dfTotal, dfTotalWeight);
    1821             :                         else
    1822           4 :                             nVal =
    1823           4 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1824           8 :                         if (bHasNoData && nVal == tNoDataValue)
    1825           0 :                             nVal = tReplacementVal;
    1826           8 :                         pDstScanline[iDstPixel] = nVal;
    1827             :                     }
    1828             :                     else
    1829             :                     {
    1830             :                         T nVal;
    1831             :                         if constexpr (bQuadraticMean)
    1832             :                         {
    1833             :                             if constexpr (bUseMulFactor)
    1834          81 :                                 nVal = static_cast<T>(
    1835          48 :                                     dfMulFactor *
    1836          81 :                                     sqrt(dfTotal / dfTotalWeight));
    1837             :                             else
    1838             :                                 nVal = static_cast<T>(
    1839             :                                     sqrt(dfTotal / dfTotalWeight));
    1840             :                         }
    1841             :                         else
    1842             :                         {
    1843             :                             if constexpr (bUseMulFactor)
    1844         184 :                                 nVal = static_cast<T>(
    1845         184 :                                     dfMulFactor * (dfTotal / dfTotalWeight));
    1846             :                             else
    1847             :                                 nVal = static_cast<T>(dfTotal / dfTotalWeight);
    1848             :                         }
    1849         265 :                         if (bHasNoData && nVal == tNoDataValue)
    1850           2 :                             nVal = tReplacementVal;
    1851         265 :                         pDstScanline[iDstPixel] = nVal;
    1852             :                     }
    1853             :                 }
    1854             :             }
    1855             :         }
    1856             :         else
    1857             :         {
    1858         115 :             nSrcYOff -= nChunkYOff;
    1859         115 :             nSrcYOff2 -= nChunkYOff;
    1860             : 
    1861        6590 :             for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1862             :             {
    1863        6475 :                 const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1864        6475 :                 const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1865             : 
    1866        6475 :                 uint64_t nTotalR = 0;
    1867        6475 :                 uint64_t nTotalG = 0;
    1868        6475 :                 uint64_t nTotalB = 0;
    1869        6475 :                 size_t nCount = 0;
    1870             : 
    1871       19425 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1872             :                 {
    1873       38850 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    1874             :                     {
    1875       25900 :                         const T val =
    1876       25900 :                             pChunk[iX + static_cast<size_t>(iY) * nChunkXSize];
    1877             :                         // cppcheck-suppress unsignedLessThanZero
    1878       25900 :                         if (val < 0 || val >= colorEntries.size())
    1879           0 :                             continue;
    1880       25900 :                         const size_t idx = static_cast<size_t>(val);
    1881       25900 :                         const auto &entry = colorEntries[idx];
    1882       25900 :                         if (entry.c4)
    1883             :                         {
    1884             :                             if constexpr (bQuadraticMean)
    1885             :                             {
    1886         800 :                                 nTotalR += SQUARE<int>(entry.c1);
    1887         800 :                                 nTotalG += SQUARE<int>(entry.c2);
    1888         800 :                                 nTotalB += SQUARE<int>(entry.c3);
    1889         800 :                                 ++nCount;
    1890             :                             }
    1891             :                             else
    1892             :                             {
    1893       13328 :                                 nTotalR += entry.c1;
    1894       13328 :                                 nTotalG += entry.c2;
    1895       13328 :                                 nTotalB += entry.c3;
    1896       13328 :                                 ++nCount;
    1897             :                             }
    1898             :                         }
    1899             :                     }
    1900             :                 }
    1901             : 
    1902        6475 :                 if (nCount == 0 ||
    1903           0 :                     (bPropagateNoData &&
    1904           0 :                      nCount < static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    1905           0 :                                   (nSrcXOff2 - nSrcXOff)))
    1906             :                 {
    1907        2838 :                     pDstScanline[iDstPixel] = tNoDataValue;
    1908             :                 }
    1909             :                 else
    1910             :                 {
    1911             :                     GDALColorEntry color;
    1912             :                     if constexpr (bQuadraticMean)
    1913             :                     {
    1914         200 :                         color.c1 =
    1915         200 :                             static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
    1916         200 :                         color.c2 =
    1917         200 :                             static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
    1918         200 :                         color.c3 =
    1919         200 :                             static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
    1920             :                     }
    1921             :                     else
    1922             :                     {
    1923        3437 :                         color.c1 =
    1924        3437 :                             static_cast<short>((nTotalR + nCount / 2) / nCount);
    1925        3437 :                         color.c2 =
    1926        3437 :                             static_cast<short>((nTotalG + nCount / 2) / nCount);
    1927        3437 :                         color.c3 =
    1928        3437 :                             static_cast<short>((nTotalB + nCount / 2) / nCount);
    1929             :                     }
    1930        3637 :                     pDstScanline[iDstPixel] =
    1931        3637 :                         static_cast<T>(BestColorEntry(colorEntries, color));
    1932             :                 }
    1933             :             }
    1934             :         }
    1935             :     }
    1936             : 
    1937        2388 :     CPLFree(pasSrcX);
    1938             : 
    1939        2388 :     return CE_None;
    1940             : }
    1941             : 
    1942             : template <bool bQuadraticMean>  // true: RMS (quadratic mean); false: plain average
    1943             : static CPLErr  // Forwards to the GDALResampleChunk_AverageOrRMS_T instantiation matching args.eWrkDataType
    1944        2388 : GDALResampleChunk_AverageOrRMSInternal(const GDALOverviewResampleArgs &args,
    1945             :                                        const void *pChunk, void **ppDstBuffer,  // pChunk is cast to the element type of the selected case
    1946             :                                        GDALDataType *peDstBufferDataType)  // [out] set to args.eWrkDataType
    1947             : {
    1948        2388 :     *peDstBufferDataType = args.eWrkDataType;  // assigned before dispatch, so it is set even on the CE_Failure path
    1949        2388 :     switch (args.eWrkDataType)
    1950             :     {
    1951        2259 :         case GDT_Byte:
    1952             :         {
    1953             :             return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte,
    1954        2259 :                                                     bQuadraticMean>(
    1955        2259 :                 args, static_cast<const GByte *>(pChunk), ppDstBuffer);
    1956             :         }
    1957             :         // UInt16 is the only case whose second template argument depends on bQuadraticMean.
    1958          11 :         case GDT_UInt16:
    1959             :         {
    1960             :             if constexpr (bQuadraticMean)
    1961             :             {
    1962             :                 // Use double as accumulation type, because UInt32 could overflow
    1963             :                 return GDALResampleChunk_AverageOrRMS_T<
    1964           6 :                     GUInt16, double, GDT_UInt16, bQuadraticMean>(
    1965           6 :                     args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
    1966             :             }
    1967             :             else  // non-RMS: GUInt32 is passed instead of double
    1968             :             {
    1969             :                 return GDALResampleChunk_AverageOrRMS_T<
    1970           5 :                     GUInt16, GUInt32, GDT_UInt16, bQuadraticMean>(
    1971           5 :                     args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
    1972             :             }
    1973             :         }
    1974             :         // Both floating-point cases pass double as the second template argument.
    1975          71 :         case GDT_Float32:
    1976             :         {
    1977             :             return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32,
    1978          71 :                                                     bQuadraticMean>(
    1979          71 :                 args, static_cast<const float *>(pChunk), ppDstBuffer);
    1980             :         }
    1981             : 
    1982          47 :         case GDT_Float64:
    1983             :         {
    1984             :             return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64,
    1985          47 :                                                     bQuadraticMean>(
    1986          47 :                 args, static_cast<const double *>(pChunk), ppDstBuffer);
    1987             :         }
    1988             :         // Byte, UInt16, Float32 and Float64 are the only working types handled here.
    1989           0 :         default:
    1990           0 :             break;
    1991             :     }
    1992             :     // Reached only through the default case: assert, then report failure.
    1993           0 :     CPLAssert(false);
    1994             :     return CE_Failure;
    1995             : }
    1996             : 
    1997             : static CPLErr  // Turns the runtime resampling name into the compile-time bQuadraticMean flag
    1998        2388 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
    1999             :                                const void *pChunk, void **ppDstBuffer,  // passed through unchanged
    2000             :                                GDALDataType *peDstBufferDataType)  // passed through unchanged
    2001             : {
    2002        2388 :     if (EQUAL(args.pszResampling, "RMS"))  // "RMS" selects the <true> (quadratic mean) instantiation
    2003          77 :         return GDALResampleChunk_AverageOrRMSInternal<true>(
    2004          77 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    2005             :     else  // any other resampling name uses the <false> instantiation
    2006        2311 :         return GDALResampleChunk_AverageOrRMSInternal<false>(
    2007        2311 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    2008             : }
    2009             : 
    2010             : /************************************************************************/
    2011             : /*                     GDALResampleChunk_Gauss()                        */
    2012             : /************************************************************************/
    2013             : 
    2014          86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
    2015             :                                       const void *pChunk, void **ppDstBuffer,
    2016             :                                       GDALDataType *peDstBufferDataType)
    2017             : 
    2018             : {
    2019          86 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    2020          86 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    2021          86 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    2022          86 :     const int nChunkXOff = args.nChunkXOff;
    2023          86 :     const int nChunkXSize = args.nChunkXSize;
    2024          86 :     const int nChunkYOff = args.nChunkYOff;
    2025          86 :     const int nChunkYSize = args.nChunkYSize;
    2026          86 :     const int nDstXOff = args.nDstXOff;
    2027          86 :     const int nDstXOff2 = args.nDstXOff2;
    2028          86 :     const int nDstYOff = args.nDstYOff;
    2029          86 :     const int nDstYOff2 = args.nDstYOff2;
    2030          86 :     const bool bHasNoData = args.bHasNoData;
    2031          86 :     double dfNoDataValue = args.dfNoDataValue;
    2032          86 :     const GDALColorTable *poColorTable = args.poColorTable;
    2033             : 
    2034          86 :     const double *const padfChunk = static_cast<const double *>(pChunk);
    2035             : 
    2036          86 :     *ppDstBuffer =
    2037          86 :         VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
    2038             :                             GDALGetDataTypeSizeBytes(GDT_Float64));
    2039          86 :     if (*ppDstBuffer == nullptr)
    2040             :     {
    2041           0 :         return CE_Failure;
    2042             :     }
    2043          86 :     *peDstBufferDataType = GDT_Float64;
    2044          86 :     double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
    2045             : 
    2046             :     /* -------------------------------------------------------------------- */
    2047             :     /*      Create the filter kernel and allocate scanline buffer.          */
    2048             :     /* -------------------------------------------------------------------- */
    2049          86 :     int nGaussMatrixDim = 3;
    2050             :     const int *panGaussMatrix;
    2051          86 :     constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
    2052          86 :     constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
    2053             :                                         4,  6, 24, 36, 24, 6, 4,  16, 24,
    2054             :                                         16, 4, 1,  4,  6,  4, 1};
    2055          86 :     constexpr int anGaussMatrix7x7[] = {
    2056             :         1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
    2057             :         6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
    2058             :         120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
    2059             :         90,  36, 6,  1,   6,   15,  20,  15, 6,  1};
    2060             : 
    2061          86 :     const int nOXSize = args.nOvrXSize;
    2062          86 :     const int nOYSize = args.nOvrYSize;
    2063          86 :     const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    2064             : 
    2065             :     // matrix for gauss filter
    2066          86 :     if (nResYFactor <= 2)
    2067             :     {
    2068          85 :         panGaussMatrix = anGaussMatrix3x3;
    2069          85 :         nGaussMatrixDim = 3;
    2070             :     }
    2071           1 :     else if (nResYFactor <= 4)
    2072             :     {
    2073           0 :         panGaussMatrix = anGaussMatrix5x5;
    2074           0 :         nGaussMatrixDim = 5;
    2075             :     }
    2076             :     else
    2077             :     {
    2078           1 :         panGaussMatrix = anGaussMatrix7x7;
    2079           1 :         nGaussMatrixDim = 7;
    2080             :     }
    2081             : 
    2082             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    2083             :     int *panGaussMatrixDup = static_cast<int *>(
    2084             :         CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
    2085             :     memcpy(panGaussMatrixDup, panGaussMatrix,
    2086             :            sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
    2087             :     panGaussMatrix = panGaussMatrixDup;
    2088             : #endif
    2089             : 
    2090          86 :     if (!bHasNoData)
    2091          79 :         dfNoDataValue = 0.0;
    2092             : 
    2093          86 :     std::vector<GDALColorEntry> colorEntries;
    2094          86 :     int nTransparentIdx = -1;
    2095          86 :     if (poColorTable)
    2096           2 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    2097             : 
    2098             :     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    2099             :     // it as nodata value.
    2100          92 :     if (bHasNoData && dfNoDataValue >= 0.0 &&
    2101           6 :         dfNoDataValue < colorEntries.size())
    2102           0 :         colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
    2103             : 
    2104             :     // Or if we have no explicit nodata, but a color table entry that is
    2105             :     // transparent, consider it as the nodata value.
    2106          86 :     else if (!bHasNoData && nTransparentIdx >= 0)
    2107             :     {
    2108           0 :         dfNoDataValue = nTransparentIdx;
    2109             :     }
    2110             : 
    2111          86 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    2112          86 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    2113          86 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
    2114             : 
    2115             :     /* ==================================================================== */
    2116             :     /*      Loop over destination scanlines.                                */
    2117             :     /* ==================================================================== */
    2118       16488 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    2119             :     {
    2120       16402 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    2121       16402 :         int nSrcYOff2 =
    2122       16402 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
    2123             : 
    2124       16402 :         if (nSrcYOff < nChunkYOff)
    2125             :         {
    2126           0 :             nSrcYOff = nChunkYOff;
    2127           0 :             nSrcYOff2++;
    2128             :         }
    2129             : 
    2130       16402 :         const int iSizeY = nSrcYOff2 - nSrcYOff;
    2131       16402 :         nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
    2132       16402 :         nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
    2133             : 
    2134       16402 :         if (nSrcYOff2 > nChunkBottomYOff ||
    2135       16359 :             (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
    2136             :         {
    2137          44 :             nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
    2138             :         }
    2139             : 
    2140       16402 :         int nYShiftGaussMatrix = 0;
    2141       16402 :         if (nSrcYOff < nChunkYOff)
    2142             :         {
    2143           0 :             nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
    2144           0 :             nSrcYOff = nChunkYOff;
    2145             :         }
    2146             : 
    2147       16402 :         const double *const padfSrcScanline =
    2148       16402 :             padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    2149       16402 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    2150       16402 :         if (pabyChunkNodataMask != nullptr)
    2151         152 :             pabySrcScanlineNodataMask =
    2152         152 :                 pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    2153             : 
    2154             :         /* --------------------------------------------------------------------
    2155             :          */
    2156             :         /*      Loop over destination pixels */
    2157             :         /* --------------------------------------------------------------------
    2158             :          */
    2159       16402 :         double *const padfDstScanline =
    2160       16402 :             padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
    2161     4149980 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    2162             :         {
    2163     4133580 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    2164     4133580 :             int nSrcXOff2 =
    2165     4133580 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
    2166             : 
    2167     4133580 :             if (nSrcXOff < nChunkXOff)
    2168             :             {
    2169           0 :                 nSrcXOff = nChunkXOff;
    2170           0 :                 nSrcXOff2++;
    2171             :             }
    2172             : 
    2173     4133580 :             const int iSizeX = nSrcXOff2 - nSrcXOff;
    2174     4133580 :             nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
    2175     4133580 :             nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
    2176             : 
    2177     4133580 :             if (nSrcXOff2 > nChunkRightXOff ||
    2178     4127930 :                 (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
    2179             :             {
    2180        5650 :                 nSrcXOff2 =
    2181        5650 :                     std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
    2182             :             }
    2183             : 
    2184     4133580 :             int nXShiftGaussMatrix = 0;
    2185     4133580 :             if (nSrcXOff < nChunkXOff)
    2186             :             {
    2187           0 :                 nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
    2188           0 :                 nSrcXOff = nChunkXOff;
    2189             :             }
    2190             : 
    2191     4133580 :             if (poColorTable == nullptr)
    2192             :             {
    2193     4133380 :                 double dfTotal = 0.0;
    2194     4133380 :                 GInt64 nCount = 0;
    2195     4133380 :                 const int *panLineWeight =
    2196     4133380 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    2197             :                     nXShiftGaussMatrix;
    2198             : 
    2199    16527900 :                 for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
    2200    12394500 :                      ++iY, ++j, panLineWeight += nGaussMatrixDim)
    2201             :                 {
    2202    49561300 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    2203             :                     {
    2204    37166800 :                         const double val =
    2205    37166800 :                             padfSrcScanline[iX - nChunkXOff +
    2206    37166800 :                                             static_cast<GPtrDiff_t>(iY -
    2207    37166800 :                                                                     nSrcYOff) *
    2208    37166800 :                                                 nChunkXSize];
    2209    37166800 :                         if (pabySrcScanlineNodataMask == nullptr ||
    2210       32872 :                             pabySrcScanlineNodataMask[iX - nChunkXOff +
    2211       32872 :                                                       static_cast<GPtrDiff_t>(
    2212       32872 :                                                           iY - nSrcYOff) *
    2213       32872 :                                                           nChunkXSize])
    2214             :                         {
    2215    37146100 :                             const int nWeight = panLineWeight[i];
    2216    37146100 :                             dfTotal += val * nWeight;
    2217    37146100 :                             nCount += nWeight;
    2218             :                         }
    2219             :                     }
    2220             :                 }
    2221             : 
    2222     4133380 :                 if (nCount == 0)
    2223             :                 {
    2224        2217 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2225             :                 }
    2226             :                 else
    2227             :                 {
    2228     4131160 :                     padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
    2229             :                 }
    2230             :             }
    2231             :             else
    2232             :             {
    2233         200 :                 GInt64 nTotalR = 0;
    2234         200 :                 GInt64 nTotalG = 0;
    2235         200 :                 GInt64 nTotalB = 0;
    2236         200 :                 GInt64 nTotalWeight = 0;
    2237         200 :                 const int *panLineWeight =
    2238         200 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    2239             :                     nXShiftGaussMatrix;
    2240             : 
    2241         780 :                 for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
    2242         580 :                      ++iY, ++j, panLineWeight += nGaussMatrixDim)
    2243             :                 {
    2244        2262 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    2245             :                     {
    2246        1682 :                         const double val =
    2247        1682 :                             padfSrcScanline[iX - nChunkXOff +
    2248        1682 :                                             static_cast<GPtrDiff_t>(iY -
    2249        1682 :                                                                     nSrcYOff) *
    2250        1682 :                                                 nChunkXSize];
    2251        1682 :                         if (val < 0 || val >= colorEntries.size())
    2252           0 :                             continue;
    2253             : 
    2254        1682 :                         size_t idx = static_cast<size_t>(val);
    2255        1682 :                         if (colorEntries[idx].c4)
    2256             :                         {
    2257        1682 :                             const int nWeight = panLineWeight[i];
    2258        1682 :                             nTotalR +=
    2259        1682 :                                 static_cast<GInt64>(colorEntries[idx].c1) *
    2260        1682 :                                 nWeight;
    2261        1682 :                             nTotalG +=
    2262        1682 :                                 static_cast<GInt64>(colorEntries[idx].c2) *
    2263        1682 :                                 nWeight;
    2264        1682 :                             nTotalB +=
    2265        1682 :                                 static_cast<GInt64>(colorEntries[idx].c3) *
    2266        1682 :                                 nWeight;
    2267        1682 :                             nTotalWeight += nWeight;
    2268             :                         }
    2269             :                     }
    2270             :                 }
    2271             : 
    2272         200 :                 if (nTotalWeight == 0)
    2273             :                 {
    2274           0 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2275             :                 }
    2276             :                 else
    2277             :                 {
    2278             :                     GDALColorEntry color;
    2279             : 
    2280         200 :                     color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
    2281             :                                                   nTotalWeight);
    2282         200 :                     color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
    2283             :                                                   nTotalWeight);
    2284         200 :                     color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
    2285             :                                                   nTotalWeight);
    2286         200 :                     padfDstScanline[iDstPixel - nDstXOff] =
    2287         200 :                         BestColorEntry(colorEntries, color);
    2288             :                 }
    2289             :             }
    2290             :         }
    2291             :     }
    2292             : 
    2293             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    2294             :     CPLFree(panGaussMatrixDup);
    2295             : #endif
    2296             : 
    2297          86 :     return CE_None;
    2298             : }
    2299             : 
    2300             : /************************************************************************/
    2301             : /*                      GDALResampleChunk_Mode()                        */
    2302             : /************************************************************************/
    2303             : 
    2304         688 : template <class T> static inline bool IsSame(T a, T b)
    2305             : {
    2306         688 :     return a == b;
    2307             : }
    2308             : 
    2309          60 : template <> bool IsSame<GFloat16>(GFloat16 a, GFloat16 b)
    2310             : {
    2311          60 :     return a == b || (CPLIsNan(a) && CPLIsNan(b));
    2312             : }
    2313             : 
    2314        4902 : template <> bool IsSame<float>(float a, float b)
    2315             : {
    2316        4902 :     return a == b || (std::isnan(a) && std::isnan(b));
    2317             : }
    2318             : 
    2319        1020 : template <> bool IsSame<double>(double a, double b)
    2320             : {
    2321        1020 :     return a == b || (std::isnan(a) && std::isnan(b));
    2322             : }
    2323             : 
    2324             : namespace
    2325             : {
    2326             : struct ComplexFloat16
    2327             : {
    2328             :     GFloat16 r;
    2329             :     GFloat16 i;
    2330             : };
    2331             : }  // namespace
    2332             : 
    2333          60 : template <> bool IsSame<ComplexFloat16>(ComplexFloat16 a, ComplexFloat16 b)
    2334             : {
    2335          90 :     return (a.r == b.r && a.i == b.i) ||
    2336          90 :            (CPLIsNan(a.r) && CPLIsNan(a.i) && CPLIsNan(b.r) && CPLIsNan(b.i));
    2337             : }
    2338             : 
    2339             : template <>
    2340          60 : bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
    2341             : {
    2342         120 :     return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
    2343         120 :                       std::isnan(b.real()) && std::isnan(b.imag()));
    2344             : }
    2345             : 
    2346             : template <>
    2347          60 : bool IsSame<std::complex<double>>(std::complex<double> a,
    2348             :                                   std::complex<double> b)
    2349             : {
    2350         120 :     return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
    2351         120 :                       std::isnan(b.real()) && std::isnan(b.imag()));
    2352             : }
    2353             : 
    2354             : template <class T>
    2355         176 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
    2356             :                                       const T *pChunk, T *const pDstBuffer)
    2357             : 
    2358             : {
    2359         176 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    2360         176 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    2361         176 :     const double dfSrcXDelta = args.dfSrcXDelta;
    2362         176 :     const double dfSrcYDelta = args.dfSrcYDelta;
    2363         176 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    2364         176 :     const int nChunkXOff = args.nChunkXOff;
    2365         176 :     const int nChunkXSize = args.nChunkXSize;
    2366         176 :     const int nChunkYOff = args.nChunkYOff;
    2367         176 :     const int nChunkYSize = args.nChunkYSize;
    2368         176 :     const int nDstXOff = args.nDstXOff;
    2369         176 :     const int nDstXOff2 = args.nDstXOff2;
    2370         176 :     const int nDstYOff = args.nDstYOff;
    2371         176 :     const int nDstYOff2 = args.nDstYOff2;
    2372         176 :     const bool bHasNoData = args.bHasNoData;
    2373         176 :     const GDALColorTable *poColorTable = args.poColorTable;
    2374         176 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    2375             : 
    2376           8 :     T tNoDataValue;
    2377             :     if constexpr (std::is_same<T, ComplexFloat16>::value)
    2378             :     {
    2379           4 :         tNoDataValue.r = cpl::NumericLimits<GFloat16>::quiet_NaN();
    2380           4 :         tNoDataValue.i = cpl::NumericLimits<GFloat16>::quiet_NaN();
    2381             :     }
    2382             :     else if constexpr (std::is_same<T, std::complex<float>>::value ||
    2383             :                        std::is_same<T, std::complex<double>>::value)
    2384             :     {
    2385             :         using BaseT = typename T::value_type;
    2386           8 :         tNoDataValue =
    2387             :             std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
    2388             :                                 std::numeric_limits<BaseT>::quiet_NaN());
    2389             :     }
    2390         164 :     else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
    2391         163 :         tNoDataValue = 0;
    2392             :     else
    2393           1 :         tNoDataValue = static_cast<T>(args.dfNoDataValue);
    2394             : 
    2395             :     using CountType = uint32_t;
    2396         176 :     CountType nMaxNumPx = 0;
    2397         176 :     T *paVals = nullptr;
    2398         176 :     CountType *panCounts = nullptr;
    2399             : 
    2400         176 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    2401         176 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    2402         352 :     std::vector<int> anVals(256, 0);
    2403             : 
    2404             :     /* ==================================================================== */
    2405             :     /*      Loop over destination scanlines.                                */
    2406             :     /* ==================================================================== */
    2407        7679 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    2408             :     {
    2409        7503 :         const double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    2410        7503 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    2411             : #ifdef only_pixels_with_more_than_10_pct_participation
    2412             :         // When oversampling, don't take into account pixels that have a tiny
    2413             :         // participation in the resulting pixel
    2414             :         if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
    2415             :             nSrcYOff < nChunkBottomYOff)
    2416             :             nSrcYOff++;
    2417             : #endif
    2418        7503 :         if (nSrcYOff < nChunkYOff)
    2419           0 :             nSrcYOff = nChunkYOff;
    2420             : 
    2421        7503 :         const double dfSrcYOff2 =
    2422        7503 :             dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    2423        7503 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    2424             : #ifdef only_pixels_with_more_than_10_pct_participation
    2425             :         // When oversampling, don't take into account pixels that have a tiny
    2426             :         // participation in the resulting pixel
    2427             :         if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
    2428             :             nSrcYOff2 > nChunkYOff)
    2429             :             nSrcYOff2--;
    2430             : #endif
    2431        7503 :         if (nSrcYOff2 == nSrcYOff)
    2432           0 :             ++nSrcYOff2;
    2433        7503 :         if (nSrcYOff2 > nChunkBottomYOff)
    2434           0 :             nSrcYOff2 = nChunkBottomYOff;
    2435             : 
    2436        7503 :         const T *const paSrcScanline =
    2437         253 :             pChunk +
    2438        7503 :             (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
    2439        7503 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    2440        7503 :         if (pabyChunkNodataMask != nullptr)
    2441        1810 :             pabySrcScanlineNodataMask =
    2442             :                 pabyChunkNodataMask +
    2443        1810 :                 static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
    2444             : 
    2445        7503 :         T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
    2446             :         /* --------------------------------------------------------------------
    2447             :          */
    2448             :         /*      Loop over destination pixels */
    2449             :         /* --------------------------------------------------------------------
    2450             :          */
    2451     4260400 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    2452             :         {
    2453     4252893 :             const double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    2454             :             // Apply some epsilon to avoid numerical precision issues
    2455     4252893 :             int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    2456             : #ifdef only_pixels_with_more_than_10_pct_participation
    2457             :             // When oversampling, don't take into account pixels that have a
    2458             :             // tiny participation in the resulting pixel
    2459             :             if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
    2460             :                 nSrcXOff < nChunkRightXOff)
    2461             :                 nSrcXOff++;
    2462             : #endif
    2463     4252893 :             if (nSrcXOff < nChunkXOff)
    2464           0 :                 nSrcXOff = nChunkXOff;
    2465             : 
    2466     4252893 :             const double dfSrcXOff2 =
    2467     4252893 :                 dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    2468     4252893 :             int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    2469             : #ifdef only_pixels_with_more_than_10_pct_participation
    2470             :             // When oversampling, don't take into account pixels that have a
    2471             :             // tiny participation in the resulting pixel
    2472             :             if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
    2473             :                 nSrcXOff2 > nChunkXOff)
    2474             :                 nSrcXOff2--;
    2475             : #endif
    2476     4252893 :             if (nSrcXOff2 == nSrcXOff)
    2477           0 :                 nSrcXOff2++;
    2478     4252893 :             if (nSrcXOff2 > nChunkRightXOff)
    2479           0 :                 nSrcXOff2 = nChunkRightXOff;
    2480             : 
    2481     4252893 :             bool bRegularProcessing = false;
    2482             :             if constexpr (!std::is_same<T, GByte>::value)
    2483        1503 :                 bRegularProcessing = true;
    2484     4251390 :             else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
    2485           0 :                 bRegularProcessing = true;
    2486             : 
    2487     4252893 :             if (bRegularProcessing)
    2488             :             {
    2489             :                 // Sanity check to make sure the allocation of paVals and
    2490             :                 // panCounts don't overflow.
    2491             :                 static_assert(sizeof(CountType) <= sizeof(size_t));
    2492        3006 :                 if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
    2493        1503 :                     static_cast<CountType>(nSrcYOff2 - nSrcYOff) >
    2494        1503 :                         (std::numeric_limits<CountType>::max() /
    2495        3006 :                          std::max(sizeof(T), sizeof(CountType))) /
    2496        1503 :                             static_cast<CountType>(nSrcXOff2 - nSrcXOff))
    2497             :                 {
    2498           0 :                     CPLError(CE_Failure, CPLE_NotSupported,
    2499             :                              "Too big downsampling factor");
    2500           0 :                     CPLFree(paVals);
    2501           0 :                     CPLFree(panCounts);
    2502           0 :                     return CE_Failure;
    2503             :                 }
    2504        1503 :                 const CountType nNumPx =
    2505        1503 :                     static_cast<CountType>(nSrcYOff2 - nSrcYOff) *
    2506        1503 :                     (nSrcXOff2 - nSrcXOff);
    2507        1503 :                 CountType iMaxInd = 0;
    2508        1503 :                 CountType iMaxVal = 0;
    2509             : 
    2510        1503 :                 if (paVals == nullptr || nNumPx > nMaxNumPx)
    2511             :                 {
    2512             :                     T *paValsNew = static_cast<T *>(
    2513         110 :                         VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
    2514             :                     CountType *panCountsNew =
    2515         110 :                         static_cast<CountType *>(VSI_REALLOC_VERBOSE(
    2516             :                             panCounts, nNumPx * sizeof(CountType)));
    2517         110 :                     if (paValsNew != nullptr)
    2518         110 :                         paVals = paValsNew;
    2519         110 :                     if (panCountsNew != nullptr)
    2520         110 :                         panCounts = panCountsNew;
    2521         110 :                     if (paValsNew == nullptr || panCountsNew == nullptr)
    2522             :                     {
    2523           0 :                         CPLFree(paVals);
    2524           0 :                         CPLFree(panCounts);
    2525           0 :                         return CE_Failure;
    2526             :                     }
    2527         110 :                     nMaxNumPx = nNumPx;
    2528             :                 }
    2529             : 
    2530        4629 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2531             :                 {
    2532        3126 :                     const GPtrDiff_t iTotYOff =
    2533        3126 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2534        3126 :                         nChunkXOff;
    2535        9858 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2536             :                     {
    2537        6732 :                         if (pabySrcScanlineNodataMask == nullptr ||
    2538          16 :                             pabySrcScanlineNodataMask[iX + iTotYOff])
    2539             :                         {
    2540        6717 :                             const T val = paSrcScanline[iX + iTotYOff];
    2541        6717 :                             CountType i = 0;  // Used after for.
    2542             : 
    2543             :                             // Check array for existing entry.
    2544       10081 :                             for (; i < iMaxInd; ++i)
    2545             :                             {
    2546        6850 :                                 if (IsSame(paVals[i], val))
    2547             :                                 {
    2548        3486 :                                     if (++panCounts[i] > panCounts[iMaxVal])
    2549             :                                     {
    2550         246 :                                         iMaxVal = i;
    2551             :                                     }
    2552        3486 :                                     break;
    2553             :                                 }
    2554             :                             }
    2555             : 
    2556             :                             // Add to arr if entry not already there.
    2557        6717 :                             if (i == iMaxInd)
    2558             :                             {
    2559        3231 :                                 paVals[iMaxInd] = val;
    2560        3231 :                                 panCounts[iMaxInd] = 1;
    2561             : 
    2562        3231 :                                 if (iMaxInd == 0)
    2563             :                                 {
    2564        1500 :                                     iMaxVal = iMaxInd;
    2565             :                                 }
    2566             : 
    2567        3231 :                                 ++iMaxInd;
    2568             :                             }
    2569             :                         }
    2570             :                     }
    2571             :                 }
    2572             : 
    2573        1503 :                 if (iMaxInd == 0)
    2574           3 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2575             :                 else
    2576        1500 :                     paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
    2577             :             }
    2578             :             else if constexpr (std::is_same<T, GByte>::value)
    2579             :             // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
    2580             :             {
    2581             :                 // So we go here for a paletted or non-paletted byte band.
    2582             :                 // The input values are then between 0 and 255.
    2583     4251390 :                 int nMaxVal = 0;
    2584     4251390 :                 int iMaxInd = -1;
    2585             : 
    2586             :                 // The cost of this zeroing might be high. Perhaps we should
    2587             :                 // just use the above generic case, and go to this one if the
    2588             :                 // number of source pixels is large enough
    2589     4251390 :                 std::fill(anVals.begin(), anVals.end(), 0);
    2590             : 
    2591    12777800 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2592             :                 {
    2593     8526440 :                     const GPtrDiff_t iTotYOff =
    2594     8526440 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2595     8526440 :                         nChunkXOff;
    2596    25649600 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2597             :                     {
    2598    17123100 :                         const T val = paSrcScanline[iX + iTotYOff];
    2599    17123100 :                         if (!bHasNoData || val != tNoDataValue)
    2600             :                         {
    2601    17123100 :                             int nVal = static_cast<int>(val);
    2602    17123100 :                             if (++anVals[nVal] > nMaxVal)
    2603             :                             {
    2604             :                                 // Sum the density.
    2605             :                                 // Is it the most common value so far?
    2606    17006400 :                                 iMaxInd = nVal;
    2607    17006400 :                                 nMaxVal = anVals[nVal];
    2608             :                             }
    2609             :                         }
    2610             :                     }
    2611             :                 }
    2612             : 
    2613     4251390 :                 if (iMaxInd == -1)
    2614           0 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2615             :                 else
    2616     4251390 :                     paDstScanline[iDstPixel - nDstXOff] =
    2617             :                         static_cast<T>(iMaxInd);
    2618             :             }
    2619             :         }
    2620             :     }
    2621             : 
    2622         176 :     CPLFree(paVals);
    2623         176 :     CPLFree(panCounts);
    2624             : 
    2625         176 :     return CE_None;
    2626             : }
    2627             : 
    2628         176 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
    2629             :                                      const void *pChunk, void **ppDstBuffer,
    2630             :                                      GDALDataType *peDstBufferDataType)
    2631             : {
    2632         176 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(
    2633             :         args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
    2634             :         GDALGetDataTypeSizeBytes(args.eWrkDataType));
    2635         176 :     if (*ppDstBuffer == nullptr)
    2636             :     {
    2637           0 :         return CE_Failure;
    2638             :     }
    2639             : 
    2640         176 :     CPLAssert(args.eSrcDataType == args.eWrkDataType);
    2641             : 
    2642         176 :     *peDstBufferDataType = args.eWrkDataType;
    2643         176 :     switch (args.eWrkDataType)
    2644             :     {
    2645             :         // For mode resampling, as no computation is done, only the
    2646             :         // size of the data type matters... except for Byte where we have
    2647             :         // special processing. And for floating point values
    2648          66 :         case GDT_Byte:
    2649             :         {
    2650          66 :             return GDALResampleChunk_ModeT(args,
    2651             :                                            static_cast<const GByte *>(pChunk),
    2652          66 :                                            static_cast<GByte *>(*ppDstBuffer));
    2653             :         }
    2654             : 
    2655           4 :         case GDT_Int8:
    2656             :         {
    2657           4 :             return GDALResampleChunk_ModeT(args,
    2658             :                                            static_cast<const int8_t *>(pChunk),
    2659           4 :                                            static_cast<int8_t *>(*ppDstBuffer));
    2660             :         }
    2661             : 
    2662          10 :         case GDT_Int16:
    2663             :         case GDT_UInt16:
    2664             :         {
    2665          10 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
    2666          10 :             return GDALResampleChunk_ModeT(
    2667             :                 args, static_cast<const uint16_t *>(pChunk),
    2668          10 :                 static_cast<uint16_t *>(*ppDstBuffer));
    2669             :         }
    2670             : 
    2671          15 :         case GDT_CInt16:
    2672             :         case GDT_Int32:
    2673             :         case GDT_UInt32:
    2674             :         {
    2675          15 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
    2676          15 :             return GDALResampleChunk_ModeT(
    2677             :                 args, static_cast<const uint32_t *>(pChunk),
    2678          15 :                 static_cast<uint32_t *>(*ppDstBuffer));
    2679             :         }
    2680             : 
    2681          12 :         case GDT_CInt32:
    2682             :         case GDT_Int64:
    2683             :         case GDT_UInt64:
    2684             :         {
    2685          12 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
    2686          12 :             return GDALResampleChunk_ModeT(
    2687             :                 args, static_cast<const uint64_t *>(pChunk),
    2688          12 :                 static_cast<uint64_t *>(*ppDstBuffer));
    2689             :         }
    2690             : 
    2691           4 :         case GDT_Float16:
    2692             :         {
    2693           4 :             return GDALResampleChunk_ModeT(
    2694             :                 args, static_cast<const GFloat16 *>(pChunk),
    2695           4 :                 static_cast<GFloat16 *>(*ppDstBuffer));
    2696             :         }
    2697             : 
    2698          32 :         case GDT_Float32:
    2699             :         {
    2700          32 :             return GDALResampleChunk_ModeT(args,
    2701             :                                            static_cast<const float *>(pChunk),
    2702          32 :                                            static_cast<float *>(*ppDstBuffer));
    2703             :         }
    2704             : 
    2705          21 :         case GDT_Float64:
    2706             :         {
    2707          21 :             return GDALResampleChunk_ModeT(args,
    2708             :                                            static_cast<const double *>(pChunk),
    2709          21 :                                            static_cast<double *>(*ppDstBuffer));
    2710             :         }
    2711             : 
    2712           4 :         case GDT_CFloat16:
    2713             :         {
    2714           4 :             return GDALResampleChunk_ModeT(
    2715             :                 args, static_cast<const ComplexFloat16 *>(pChunk),
    2716           4 :                 static_cast<ComplexFloat16 *>(*ppDstBuffer));
    2717             :         }
    2718             : 
    2719           4 :         case GDT_CFloat32:
    2720             :         {
    2721           4 :             return GDALResampleChunk_ModeT(
    2722             :                 args, static_cast<const std::complex<float> *>(pChunk),
    2723           4 :                 static_cast<std::complex<float> *>(*ppDstBuffer));
    2724             :         }
    2725             : 
    2726           4 :         case GDT_CFloat64:
    2727             :         {
    2728           4 :             return GDALResampleChunk_ModeT(
    2729             :                 args, static_cast<const std::complex<double> *>(pChunk),
    2730           4 :                 static_cast<std::complex<double> *>(*ppDstBuffer));
    2731             :         }
    2732             : 
    2733           0 :         case GDT_Unknown:
    2734             :         case GDT_TypeCount:
    2735           0 :             break;
    2736             :     }
    2737             : 
    2738           0 :     CPLAssert(false);
    2739             :     return CE_Failure;
    2740             : }
    2741             : 
    2742             : /************************************************************************/
    2743             : /*                  GDALResampleConvolutionHorizontal()                 */
    2744             : /************************************************************************/
    2745             : 
    // Returns sum(pChunk[k] * padfWeights[k]) for k in [0, nSrcPixelCount),
    // with every sample converted to double before the multiplication.
    //
    // The main loop consumes 4 samples per iteration and spreads them over two
    // partial sums (samples 0-1 in the first, samples 2-3 in the second); the
    // tail loop adds the remaining samples to the first partial sum.
    template <class T>
    static inline double
    GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                      int nSrcPixelCount)
    {
        double dfSumLo = 0.0;
        double dfSumHi = 0.0;
        int iPix = 0;  // Shared by the unrolled loop and the tail loop.
        // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
        // manually (untypical) unrolled loop in -O2 and -O3:
        // https://github.com/OSGeo/gdal/issues/9508
    #if !defined(__INTEL_CLANG_COMPILER)
        const int nUnrolledEnd = nSrcPixelCount - 3;
        while (iPix < nUnrolledEnd)
        {
            const T *pSrc = pChunk + iPix;
            const double *pW = padfWeights + iPix;
            dfSumLo += static_cast<double>(pSrc[0]) * pW[0];
            dfSumLo += static_cast<double>(pSrc[1]) * pW[1];
            dfSumHi += static_cast<double>(pSrc[2]) * pW[2];
            dfSumHi += static_cast<double>(pSrc[3]) * pW[3];
            iPix += 4;
        }
    #endif
        while (iPix < nSrcPixelCount)
        {
            dfSumLo += static_cast<double>(pChunk[iPix]) * padfWeights[iPix];
            ++iPix;
        }
        return dfSumLo + dfSumHi;
    }
    2772             : 
    2773             : template <class T>
    2774       44576 : static inline void GDALResampleConvolutionHorizontalWithMask(
    2775             :     const T *pChunk, const GByte *pabyMask, const double *padfWeights,
    2776             :     int nSrcPixelCount, double &dfVal, double &dfWeightSum)
    2777             : {
    2778       44576 :     dfVal = 0;
    2779       44576 :     dfWeightSum = 0;
    2780       44576 :     int i = 0;
    2781       98300 :     for (; i < nSrcPixelCount - 3; i += 4)
    2782             :     {
    2783       53724 :         const double dfWeight0 = padfWeights[i] * pabyMask[i];
    2784       53724 :         const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
    2785       53724 :         const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
    2786       53724 :         const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
    2787       53724 :         dfVal += double(pChunk[i + 0]) * dfWeight0;
    2788       53724 :         dfVal += double(pChunk[i + 1]) * dfWeight1;
    2789       53724 :         dfVal += double(pChunk[i + 2]) * dfWeight2;
    2790       53724 :         dfVal += double(pChunk[i + 3]) * dfWeight3;
    2791       53724 :         dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
    2792             :     }
    2793       61162 :     for (; i < nSrcPixelCount; ++i)
    2794             :     {
    2795       16586 :         const double dfWeight = padfWeights[i] * pabyMask[i];
    2796       16586 :         dfVal += double(pChunk[i]) * dfWeight;
    2797       16586 :         dfWeightSum += dfWeight;
    2798             :     }
    2799       44576 : }
    2800             : 
    // Horizontal convolution of three rows sharing the same weights.
    //
    // On return dfResN = sum(pChunkRowN[k] * padfWeights[k]) for k in
    // [0, nSrcPixelCount), N = 1..3. Each row uses two partial sums in the
    // 4-sample main loop (samples 0-1 / samples 2-3); leftover samples go to
    // the first partial sum of the row. The outputs are only written once, at
    // the end.
    template <class T>
    static inline void GDALResampleConvolutionHorizontal_3rows(
        const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
        const double *padfWeights, int nSrcPixelCount, double &dfRes1,
        double &dfRes2, double &dfRes3)
    {
        double dfRow1Lo = 0.0;
        double dfRow1Hi = 0.0;
        double dfRow2Lo = 0.0;
        double dfRow2Hi = 0.0;
        double dfRow3Lo = 0.0;
        double dfRow3Hi = 0.0;

        int iPix = 0;  // Shared by the unrolled loop and the tail loop.
        const int nUnrolledEnd = nSrcPixelCount - 3;
        while (iPix < nUnrolledEnd)
        {
            const double *pW = padfWeights + iPix;
            const T *pR1 = pChunkRow1 + iPix;
            const T *pR2 = pChunkRow2 + iPix;
            const T *pR3 = pChunkRow3 + iPix;

            dfRow1Lo += static_cast<double>(pR1[0]) * pW[0];
            dfRow1Lo += static_cast<double>(pR1[1]) * pW[1];
            dfRow1Hi += static_cast<double>(pR1[2]) * pW[2];
            dfRow1Hi += static_cast<double>(pR1[3]) * pW[3];

            dfRow2Lo += static_cast<double>(pR2[0]) * pW[0];
            dfRow2Lo += static_cast<double>(pR2[1]) * pW[1];
            dfRow2Hi += static_cast<double>(pR2[2]) * pW[2];
            dfRow2Hi += static_cast<double>(pR2[3]) * pW[3];

            dfRow3Lo += static_cast<double>(pR3[0]) * pW[0];
            dfRow3Lo += static_cast<double>(pR3[1]) * pW[1];
            dfRow3Hi += static_cast<double>(pR3[2]) * pW[2];
            dfRow3Hi += static_cast<double>(pR3[3]) * pW[3];

            iPix += 4;
        }

        while (iPix < nSrcPixelCount)
        {
            const double dfW = padfWeights[iPix];
            dfRow1Lo += static_cast<double>(pChunkRow1[iPix]) * dfW;
            dfRow2Lo += static_cast<double>(pChunkRow2[iPix]) * dfW;
            dfRow3Lo += static_cast<double>(pChunkRow3[iPix]) * dfW;
            ++iPix;
        }

        dfRes1 = dfRow1Lo + dfRow1Hi;
        dfRes2 = dfRow2Lo + dfRow2Hi;
        dfRes3 = dfRow3Lo + dfRow3Hi;
    }
    2839             : 
    // Generic implementation of the 3-row horizontal convolution entry point
    // named for "pixel count less than 8": it simply forwards all of its
    // arguments, unchanged, to GDALResampleConvolutionHorizontal_3rows().
    template <class T>
    static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
        const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
        const double *padfWeights, int nSrcPixelCount, double &dfRes1,
        double &dfRes2, double &dfRes3)
    {
        GDALResampleConvolutionHorizontal_3rows(
            pChunkRow1, pChunkRow2, pChunkRow3,  // the three source rows
            padfWeights, nSrcPixelCount,         // shared weights and length
            dfRes1, dfRes2, dfRes3);             // one result per row
    }
    2850             : 
    // Generic implementation of the 3-row horizontal convolution for exactly
    // 4 source pixels: forwards to GDALResampleConvolutionHorizontal_3rows()
    // with a pixel count of 4.
    template <class T>
    static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
        const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
        const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
    {
        constexpr int nFixedPixelCount = 4;
        GDALResampleConvolutionHorizontal_3rows(
            pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, nFixedPixelCount,
            dfRes1, dfRes2, dfRes3);
    }
    2860             : 
    2861             : /************************************************************************/
    2862             : /*                  GDALResampleConvolutionVertical()                   */
    2863             : /************************************************************************/
    2864             : 
    // Returns sum(pChunk[k * nStride] * padfWeights[k]) for k in
    // [0, nSrcLineCount), i.e. a weighted sum of samples spaced nStride
    // elements apart.
    //
    // The main loop handles 4 lines per iteration using two partial sums
    // (lines 0-1 / lines 2-3); leftover lines are added to the first one.
    template <class T>
    static inline double
    GDALResampleConvolutionVertical(const T *pChunk, size_t nStride,
                                    const double *padfWeights, int nSrcLineCount)
    {
        double dfSumLo = 0.0;
        double dfSumHi = 0.0;
        int iLine = 0;
        size_t nOffset = 0;  // Always equal to iLine * nStride.

        const int nUnrolledEnd = nSrcLineCount - 3;
        while (iLine < nUnrolledEnd)
        {
            const double *pW = padfWeights + iLine;
            dfSumLo += pChunk[nOffset + 0 * nStride] * pW[0];
            dfSumLo += pChunk[nOffset + 1 * nStride] * pW[1];
            dfSumHi += pChunk[nOffset + 2 * nStride] * pW[2];
            dfSumHi += pChunk[nOffset + 3 * nStride] * pW[3];
            iLine += 4;
            nOffset += 4 * nStride;
        }

        while (iLine < nSrcLineCount)
        {
            dfSumLo += pChunk[nOffset] * padfWeights[iLine];
            ++iLine;
            nOffset += nStride;
        }

        return dfSumLo + dfSumHi;
    }
    2887             : 
    // Vertical convolution of two adjacent columns at once.
    //
    // On return:
    //   dfRes1 = sum(pChunk[k * nStride + 0] * padfWeights[k])
    //   dfRes2 = sum(pChunk[k * nStride + 1] * padfWeights[k])
    // for k in [0, nSrcLineCount). Each column uses two partial sums in the
    // 4-line main loop (lines 0-1 / lines 2-3); leftover lines go to the first
    // partial sum of the column.
    template <class T>
    static inline void GDALResampleConvolutionVertical_2cols(
        const T *pChunk, size_t nStride, const double *padfWeights,
        int nSrcLineCount, double &dfRes1, double &dfRes2)
    {
        double dfCol0Lo = 0.0;
        double dfCol0Hi = 0.0;
        double dfCol1Lo = 0.0;
        double dfCol1Hi = 0.0;
        int iLine = 0;
        size_t nOffset = 0;  // Always equal to iLine * nStride.

        const int nUnrolledEnd = nSrcLineCount - 3;
        while (iLine < nUnrolledEnd)
        {
            const double *pW = padfWeights + iLine;
            dfCol0Lo += pChunk[nOffset + 0 + 0 * nStride] * pW[0];
            dfCol1Lo += pChunk[nOffset + 1 + 0 * nStride] * pW[0];
            dfCol0Lo += pChunk[nOffset + 0 + 1 * nStride] * pW[1];
            dfCol1Lo += pChunk[nOffset + 1 + 1 * nStride] * pW[1];
            dfCol0Hi += pChunk[nOffset + 0 + 2 * nStride] * pW[2];
            dfCol1Hi += pChunk[nOffset + 1 + 2 * nStride] * pW[2];
            dfCol0Hi += pChunk[nOffset + 0 + 3 * nStride] * pW[3];
            dfCol1Hi += pChunk[nOffset + 1 + 3 * nStride] * pW[3];
            iLine += 4;
            nOffset += 4 * nStride;
        }

        while (iLine < nSrcLineCount)
        {
            const double dfW = padfWeights[iLine];
            dfCol0Lo += pChunk[nOffset + 0] * dfW;
            dfCol1Lo += pChunk[nOffset + 1] * dfW;
            ++iLine;
            nOffset += nStride;
        }

        dfRes1 = dfCol0Lo + dfCol0Hi;
        dfRes2 = dfCol1Lo + dfCol1Hi;
    }
    2918             : 
    2919             : #ifdef USE_SSE2
    2920             : 
    2921             : #ifdef __AVX__
    2922             : /************************************************************************/
    2923             : /*             GDALResampleConvolutionVertical_16cols<T>                */
    2924             : /************************************************************************/
    2925             : 
    2926             : template <class T>
    2927             : static inline void
    2928             : GDALResampleConvolutionVertical_16cols(const T *pChunk, size_t nStride,
    2929             :                                        const double *padfWeights,
    2930             :                                        int nSrcLineCount, float *afDest)
    2931             : {
    2932             :     int i = 0;
    2933             :     size_t j = 0;
    2934             :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
    2935             :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2936             :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    2937             :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
    2938             :     for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
    2939             :     {
    2940             :         XMMReg4Double w0 =
    2941             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
    2942             :         XMMReg4Double w1 =
    2943             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    2944             :         XMMReg4Double w2 =
    2945             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    2946             :         XMMReg4Double w3 =
    2947             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    2948             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
    2949             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    2950             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
    2951             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
    2952             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
    2953             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    2954             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
    2955             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
    2956             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
    2957             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    2958             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
    2959             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
    2960             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
    2961             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    2962             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
    2963             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
    2964             :     }
    2965             :     for (; i < nSrcLineCount; ++i, j += nStride)
    2966             :     {
    2967             :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    2968             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    2969             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    2970             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
    2971             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
    2972             :     }
    2973             :     v_acc0.Store4Val(afDest);
    2974             :     v_acc1.Store4Val(afDest + 4);
    2975             :     v_acc2.Store4Val(afDest + 8);
    2976             :     v_acc3.Store4Val(afDest + 12);
    2977             : }
    2978             : 
    2979             : template <class T>
    2980             : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
    2981             :                                                           const double *, int,
    2982             :                                                           double *)
    2983             : {
    2984             :     // Cannot be reached
    2985             :     CPLAssert(false);
    2986             : }
    2987             : 
    2988             : #else
    2989             : 
    2990             : /************************************************************************/
    2991             : /*              GDALResampleConvolutionVertical_8cols<T>                */
    2992             : /************************************************************************/
    2993             : 
    2994             : template <class T>
    2995             : static inline void
    2996    24813600 : GDALResampleConvolutionVertical_8cols(const T *pChunk, size_t nStride,
    2997             :                                       const double *padfWeights,
    2998             :                                       int nSrcLineCount, float *afDest)
    2999             : {
    3000    24813600 :     int i = 0;
    3001    24813600 :     size_t j = 0;
    3002    24813600 :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
    3003    24821200 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    3004    51151600 :     for (; i < nSrcLineCount - 3; i += 4, j += 4 * nStride)
    3005             :     {
    3006    26329600 :         XMMReg4Double w0 =
    3007    26329600 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
    3008    26318400 :         XMMReg4Double w1 =
    3009    26318400 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    3010    26330100 :         XMMReg4Double w2 =
    3011    26330100 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    3012    26327400 :         XMMReg4Double w3 =
    3013    26327400 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    3014    26343700 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
    3015    26319400 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    3016    26308500 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
    3017    26293200 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    3018    26295300 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
    3019    26295000 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    3020    26298700 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
    3021    26319100 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    3022             :     }
    3023    36381700 :     for (; i < nSrcLineCount; ++i, j += nStride)
    3024             :     {
    3025    11559600 :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    3026    11559600 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    3027    11559600 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    3028             :     }
    3029    24822000 :     v_acc0.Store4Val(afDest);
    3030    24805900 :     v_acc1.Store4Val(afDest + 4);
    3031    24832800 : }
    3032             : 
    3033             : template <class T>
    3034             : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
    3035             :                                                          const double *, int,
    3036             :                                                          double *)
    3037             : {
    3038             :     // Cannot be reached
    3039             :     CPLAssert(false);
    3040             : }
    3041             : 
    3042             : #endif  // __AVX__
    3043             : 
    3044             : /************************************************************************/
    3045             : /*              GDALResampleConvolutionHorizontalSSE2<T>                */
    3046             : /************************************************************************/
    3047             : 
    3048             : template <class T>
    3049     3112884 : static inline double GDALResampleConvolutionHorizontalSSE2(
    3050             :     const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    3051             : {
    3052     3112884 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    3053     3112453 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    3054     3112656 :     int i = 0;  // Used after for.
    3055     3463145 :     for (; i < nSrcPixelCount - 7; i += 8)
    3056             :     {
    3057             :         // Retrieve the pixel & accumulate
    3058      350909 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
    3059      350906 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
    3060      350909 :         const XMMReg4Double v_weight1 =
    3061      350909 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3062      350907 :         const XMMReg4Double v_weight2 =
    3063      350907 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    3064             : 
    3065      350909 :         v_acc1 += v_pixels1 * v_weight1;
    3066      350904 :         v_acc2 += v_pixels2 * v_weight2;
    3067             :     }
    3068             : 
    3069     3112239 :     v_acc1 += v_acc2;
    3070             : 
    3071     3112688 :     double dfVal = v_acc1.GetHorizSum();
    3072    10288540 :     for (; i < nSrcPixelCount; ++i)
    3073             :     {
    3074     7175910 :         dfVal += pChunk[i] * padfWeightsAligned[i];
    3075             :     }
    3076     3112629 :     return dfVal;
    3077             : }
    3078             : 
    3079             : /************************************************************************/
    3080             : /*              GDALResampleConvolutionHorizontal<GByte>                */
    3081             : /************************************************************************/
    3082             : 
    3083             : template <>
    3084     2563970 : inline double GDALResampleConvolutionHorizontal<GByte>(
    3085             :     const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    3086             : {
    3087     2563970 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
    3088     2563970 :                                                  nSrcPixelCount);
    3089             : }
    3090             : 
    3091             : template <>
    3092      547702 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
    3093             :     const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    3094             : {
    3095      547702 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
    3096      548996 :                                                  nSrcPixelCount);
    3097             : }
    3098             : 
    3099             : /************************************************************************/
    3100             : /*              GDALResampleConvolutionHorizontalWithMaskSSE2<T>        */
    3101             : /************************************************************************/
    3102             : 
    3103             : template <class T>
    3104     7042823 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
    3105             :     const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
    3106             :     int nSrcPixelCount, double &dfVal, double &dfWeightSum)
    3107             : {
    3108     7042823 :     int i = 0;  // Used after for.
    3109     7042823 :     XMMReg4Double v_acc = XMMReg4Double::Zero();
    3110     7043103 :     XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
    3111    19720821 :     for (; i < nSrcPixelCount - 3; i += 4)
    3112             :     {
    3113    12675158 :         const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
    3114    12668458 :         const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
    3115    12668858 :         XMMReg4Double v_weight =
    3116    12668858 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3117    12671858 :         v_weight *= v_mask;
    3118    12669458 :         v_acc += v_pixels * v_weight;
    3119    12671158 :         v_acc_weight += v_weight;
    3120             :     }
    3121             : 
    3122     7045673 :     dfVal = v_acc.GetHorizSum();
    3123     7054993 :     dfWeightSum = v_acc_weight.GetHorizSum();
    3124     7287433 :     for (; i < nSrcPixelCount; ++i)
    3125             :     {
    3126      231090 :         const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
    3127      231090 :         dfVal += pChunk[i] * dfWeight;
    3128      231090 :         dfWeightSum += dfWeight;
    3129             :     }
    3130     7056343 : }
    3131             : 
    3132             : /************************************************************************/
    3133             : /*              GDALResampleConvolutionHorizontalWithMask<GByte>        */
    3134             : /************************************************************************/
    3135             : 
    3136             : template <>
    3137     7051180 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
    3138             :     const GByte *pChunk, const GByte *pabyMask,
    3139             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
    3140             :     double &dfWeightSum)
    3141             : {
    3142     7051180 :     GDALResampleConvolutionHorizontalWithMaskSSE2(
    3143             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
    3144             :         dfWeightSum);
    3145     7057360 : }
    3146             : 
    3147             : template <>
    3148          63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
    3149             :     const GUInt16 *pChunk, const GByte *pabyMask,
    3150             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
    3151             :     double &dfWeightSum)
    3152             : {
    3153          63 :     GDALResampleConvolutionHorizontalWithMaskSSE2(
    3154             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
    3155             :         dfWeightSum);
    3156          63 : }
    3157             : 
    3158             : /************************************************************************/
    3159             : /*              GDALResampleConvolutionHorizontal_3rows_SSE2<T>         */
    3160             : /************************************************************************/
    3161             : 
    3162             : template <class T>
    3163    32036886 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
    3164             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    3165             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3166             :     double &dfRes2, double &dfRes3)
    3167             : {
    3168    32036886 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
    3169    32022986 :                   v_acc2 = XMMReg4Double::Zero(),
    3170    32031986 :                   v_acc3 = XMMReg4Double::Zero();
    3171    32042886 :     int i = 0;
    3172    63897056 :     for (; i < nSrcPixelCount - 7; i += 8)
    3173             :     {
    3174             :         // Retrieve the pixel & accumulate.
    3175    31881770 :         XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    3176    31880970 :         XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
    3177    31888770 :         const XMMReg4Double v_weight1 =
    3178    31888770 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3179    31842970 :         const XMMReg4Double v_weight2 =
    3180    31842970 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    3181             : 
    3182    31857570 :         v_acc1 += v_pixels1 * v_weight1;
    3183    31843070 :         v_acc1 += v_pixels2 * v_weight2;
    3184             : 
    3185    31844770 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
    3186    31855970 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
    3187    31868870 :         v_acc2 += v_pixels1 * v_weight1;
    3188    31836070 :         v_acc2 += v_pixels2 * v_weight2;
    3189             : 
    3190    31841370 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
    3191    31871170 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
    3192    31871370 :         v_acc3 += v_pixels1 * v_weight1;
    3193    31832870 :         v_acc3 += v_pixels2 * v_weight2;
    3194             :     }
    3195             : 
    3196    32015286 :     dfRes1 = v_acc1.GetHorizSum();
    3197    32038086 :     dfRes2 = v_acc2.GetHorizSum();
    3198    32040986 :     dfRes3 = v_acc3.GetHorizSum();
    3199    44201152 :     for (; i < nSrcPixelCount; ++i)
    3200             :     {
    3201    12156866 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    3202    12156866 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    3203    12156866 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    3204             :     }
    3205    32044286 : }
    3206             : 
    3207             : /************************************************************************/
    3208             : /*              GDALResampleConvolutionHorizontal_3rows<GByte>          */
    3209             : /************************************************************************/
    3210             : 
    3211             : template <>
    3212    32055500 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
    3213             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3214             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3215             :     double &dfRes2, double &dfRes3)
    3216             : {
    3217    32055500 :     GDALResampleConvolutionHorizontal_3rows_SSE2(
    3218             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3219             :         dfRes1, dfRes2, dfRes3);
    3220    32020000 : }
    3221             : 
    3222             : template <>
    3223          86 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
    3224             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3225             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    3226             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
    3227             : {
    3228          86 :     GDALResampleConvolutionHorizontal_3rows_SSE2(
    3229             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3230             :         dfRes1, dfRes2, dfRes3);
    3231          86 : }
    3232             : 
    3233             : /************************************************************************/
    3234             : /*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>   */
    3235             : /************************************************************************/
    3236             : 
    3237             : template <class T>
    3238     7122126 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3239             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    3240             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3241             :     double &dfRes2, double &dfRes3)
    3242             : {
    3243     7122126 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    3244     7111515 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    3245     7113506 :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
    3246     7116117 :     int i = 0;  // Use after for.
    3247    16899139 :     for (; i < nSrcPixelCount - 3; i += 4)
    3248             :     {
    3249             :         // Retrieve the pixel & accumulate.
    3250     9791660 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    3251     9799940 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
    3252     9776680 :         const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
    3253     9812570 :         const XMMReg4Double v_weight =
    3254     9812570 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3255             : 
    3256     9778350 :         v_acc1 += v_pixels1 * v_weight;
    3257     9753840 :         v_acc2 += v_pixels2 * v_weight;
    3258     9762090 :         v_acc3 += v_pixels3 * v_weight;
    3259             :     }
    3260             : 
    3261     7107469 :     dfRes1 = v_acc1.GetHorizSum();
    3262     7114146 :     dfRes2 = v_acc2.GetHorizSum();
    3263     7112909 :     dfRes3 = v_acc3.GetHorizSum();
    3264             : 
    3265    11551389 :     for (; i < nSrcPixelCount; ++i)
    3266             :     {
    3267     4438504 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    3268     4438504 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    3269     4438504 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    3270             :     }
    3271     7112865 : }
    3272             : 
    3273             : /************************************************************************/
    3274             : /*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>    */
    3275             : /************************************************************************/
    3276             : 
    3277             : template <>
    3278     7057950 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
    3279             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3280             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3281             :     double &dfRes2, double &dfRes3)
    3282             : {
    3283     7057950 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3284             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3285             :         dfRes1, dfRes2, dfRes3);
    3286     7045740 : }
    3287             : 
    3288             : template <>
    3289       67039 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
    3290             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3291             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    3292             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
    3293             : {
    3294       67039 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3295             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3296             :         dfRes1, dfRes2, dfRes3);
    3297       67109 : }
    3298             : 
    3299             : /************************************************************************/
    3300             : /*     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>       */
    3301             : /************************************************************************/
    3302             : 
    3303             : template <class T>
    3304    13862460 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3305             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    3306             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
    3307             :     double &dfRes3)
    3308             : {
    3309    13862460 :     const XMMReg4Double v_weight =
    3310             :         XMMReg4Double::Load4ValAligned(padfWeightsAligned);
    3311             : 
    3312             :     // Retrieve the pixel & accumulate.
    3313    13920490 :     const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
    3314    13938010 :     const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
    3315    13867750 :     const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
    3316             : 
    3317    13938240 :     XMMReg4Double v_acc1 = v_pixels1 * v_weight;
    3318    13906500 :     XMMReg4Double v_acc2 = v_pixels2 * v_weight;
    3319    13902380 :     XMMReg4Double v_acc3 = v_pixels3 * v_weight;
    3320             : 
    3321    13899710 :     dfRes1 = v_acc1.GetHorizSum();
    3322    13895130 :     dfRes2 = v_acc2.GetHorizSum();
    3323    13928400 :     dfRes3 = v_acc3.GetHorizSum();
    3324    13910420 : }
    3325             : 
    3326             : /************************************************************************/
    3327             : /*       GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>      */
    3328             : /************************************************************************/
    3329             : 
    3330             : template <>
    3331     8268080 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
    3332             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3333             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
    3334             :     double &dfRes3)
    3335             : {
    3336     8268080 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3337             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    3338             :         dfRes3);
    3339     8242110 : }
    3340             : 
    3341             : template <>
    3342     5680720 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
    3343             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3344             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
    3345             :     double &dfRes2, double &dfRes3)
    3346             : {
    3347     5680720 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3348             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    3349             :         dfRes3);
    3350     5651960 : }
    3351             : 
    3352             : #endif  // USE_SSE2
    3353             : 
    3354             : /************************************************************************/
    3355             : /*                    GDALResampleChunk_Convolution()                   */
    3356             : /************************************************************************/
    3357             : 
    3358             : template <class T, class Twork, GDALDataType eWrkDataType,
    3359             :           bool bKernelWithNegativeWeights, bool bNeedRescale>
    3360        5030 : static CPLErr GDALResampleChunk_ConvolutionT(
    3361             :     const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
    3362             :     FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
    3363             :     int nKernelRadius, float fMaxVal)
    3364             : 
    3365             : {
    3366        5030 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    3367        5030 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    3368        5030 :     const double dfSrcXDelta = args.dfSrcXDelta;
    3369        5030 :     const double dfSrcYDelta = args.dfSrcYDelta;
    3370        5030 :     constexpr int nBands = 1;
    3371        5030 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    3372        5030 :     const int nChunkXOff = args.nChunkXOff;
    3373        5030 :     const int nChunkXSize = args.nChunkXSize;
    3374        5030 :     const int nChunkYOff = args.nChunkYOff;
    3375        5030 :     const int nChunkYSize = args.nChunkYSize;
    3376        5030 :     const int nDstXOff = args.nDstXOff;
    3377        5030 :     const int nDstXOff2 = args.nDstXOff2;
    3378        5030 :     const int nDstYOff = args.nDstYOff;
    3379        5030 :     const int nDstYOff2 = args.nDstYOff2;
    3380        5030 :     const bool bHasNoData = args.bHasNoData;
    3381        5030 :     double dfNoDataValue = args.dfNoDataValue;
    3382             : 
    3383        5030 :     if (!bHasNoData)
    3384        4955 :         dfNoDataValue = 0.0;
    3385        5030 :     const auto dstDataType = args.eOvrDataType;
    3386        5030 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
    3387        5030 :     const double dfReplacementVal =
    3388          75 :         bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
    3389             :                    : dfNoDataValue;
    3390             :     // cppcheck-suppress unreadVariable
    3391        5030 :     const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
    3392        5021 :     const bool bNoDataValueInt64Valid =
    3393        5028 :         isIntegerDT && GDALIsValueExactAs<GInt64>(dfNoDataValue);
    3394        5021 :     const auto nNodataValueInt64 =
    3395             :         bNoDataValueInt64Valid ? static_cast<GInt64>(dfNoDataValue) : 0;
    3396        5021 :     constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
    3397             : 
    3398             :     // TODO: we should have some generic function to do this.
    3399        5021 :     Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
    3400        5021 :     Twork fDstMax = cpl::NumericLimits<Twork>::max();
    3401        5021 :     if (dstDataType == GDT_Byte)
    3402             :     {
    3403        4164 :         fDstMin = std::numeric_limits<GByte>::min();
    3404        4161 :         fDstMax = std::numeric_limits<GByte>::max();
    3405             :     }
    3406         859 :     else if (dstDataType == GDT_Int8)
    3407             :     {
    3408           1 :         fDstMin = std::numeric_limits<GInt8>::min();
    3409           1 :         fDstMax = std::numeric_limits<GInt8>::max();
    3410             :     }
    3411         858 :     else if (dstDataType == GDT_UInt16)
    3412             :     {
    3413         396 :         fDstMin = std::numeric_limits<GUInt16>::min();
    3414         393 :         fDstMax = std::numeric_limits<GUInt16>::max();
    3415             :     }
    3416         465 :     else if (dstDataType == GDT_Int16)
    3417             :     {
    3418         291 :         fDstMin = std::numeric_limits<GInt16>::min();
    3419         291 :         fDstMax = std::numeric_limits<GInt16>::max();
    3420             :     }
    3421         174 :     else if (dstDataType == GDT_UInt32)
    3422             :     {
    3423           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
    3424           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
    3425             :     }
    3426         173 :     else if (dstDataType == GDT_Int32)
    3427             :     {
    3428             :         // cppcheck-suppress unreadVariable
    3429           2 :         fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
    3430             :         // cppcheck-suppress unreadVariable
    3431           2 :         fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
    3432             :     }
    3433         171 :     else if (dstDataType == GDT_UInt64)
    3434             :     {
    3435             :         // cppcheck-suppress unreadVariable
    3436           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
    3437             :         // cppcheck-suppress unreadVariable
    3438             :         // (1 << 64) - 2048: largest uint64 value a double can hold
    3439           1 :         fDstMax = static_cast<Twork>(18446744073709549568ULL);
    3440             :     }
    3441         170 :     else if (dstDataType == GDT_Int64)
    3442             :     {
    3443             :         // cppcheck-suppress unreadVariable
    3444           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
    3445             :         // cppcheck-suppress unreadVariable
    3446             :         // (1 << 63) - 1024: largest int64 that a double can hold
    3447           1 :         fDstMax = static_cast<Twork>(9223372036854774784LL);
    3448             :     }
    3449             : 
    3450    37229031 :     auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
    3451             :                                bNoDataValueInt64Valid, nNodataValueInt64,
    3452             :                                dfNoDataValue, dfReplacementVal](Twork fVal)
    3453             :     {
    3454    16155000 :         if (!bHasNoData)
    3455    11940300 :             return fVal;
    3456             : 
    3457             :         // Clamp value before comparing to nodata: this is only needed for
    3458             :         // kernels with negative weights (Lanczos)
    3459     4214720 :         Twork fClamped = fVal;
    3460     4214720 :         if (fClamped < fDstMin)
    3461       15998 :             fClamped = fDstMin;
    3462     4198730 :         else if (fClamped > fDstMax)
    3463       16406 :             fClamped = fDstMax;
    3464     4214720 :         if (isIntegerDT)
    3465             :         {
    3466     4184520 :             if (bNoDataValueInt64Valid)
    3467             :             {
    3468     4192860 :                 const double fClampedRounded = double(std::round(fClamped));
    3469     8426020 :                 if (fClampedRounded >=
    3470             :                         static_cast<double>(static_cast<Twork>(
    3471     8425320 :                             std::numeric_limits<int64_t>::min())) &&
    3472             :                     fClampedRounded <= static_cast<double>(static_cast<Twork>(
    3473     8424440 :                                            9223372036854774784LL)) &&
    3474     4211640 :                     nNodataValueInt64 ==
    3475     4212460 :                         static_cast<GInt64>(std::round(fClamped)))
    3476             :                 {
    3477             :                     // Do not use the nodata value
    3478       14435 :                     return static_cast<Twork>(dfReplacementVal);
    3479             :                 }
    3480             :             }
    3481             :         }
    3482       30202 :         else if (dfNoDataValue == static_cast<double>(fClamped))
    3483             :         {
    3484             :             // Do not use the nodata value
    3485           1 :             return static_cast<Twork>(dfReplacementVal);
    3486             :         }
    3487     4220020 :         return fClamped;
    3488             :     };
    3489             : 
    3490             :     /* -------------------------------------------------------------------- */
    3491             :     /*      Allocate work buffers.                                          */
    3492             :     /* -------------------------------------------------------------------- */
    3493        5024 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    3494        5024 :     Twork *pafWrkScanline = nullptr;
    3495        5024 :     if (dstDataType != eWrkDataType)
    3496             :     {
    3497             :         pafWrkScanline =
    3498        4856 :             static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
    3499        4861 :         if (pafWrkScanline == nullptr)
    3500           0 :             return CE_Failure;
    3501             :     }
    3502             : 
    3503        5029 :     const double dfXScale = 1.0 / dfXRatioDstToSrc;
    3504        5029 :     const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
    3505        5029 :     const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
    3506        5029 :     const double dfYScale = 1.0 / dfYRatioDstToSrc;
    3507        5029 :     const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
    3508        5029 :     const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
    3509             : 
    3510             :     // Temporary array to store result of horizontal filter.
    3511             :     double *const padfHorizontalFiltered = static_cast<double *>(
    3512        5029 :         VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
    3513             : 
    3514             :     // To store convolution coefficients.
    3515             :     double *const padfWeights =
    3516        5032 :         static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
    3517             :             static_cast<int>(
    3518             :                 2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) + 0.5) *
    3519             :             sizeof(double)));
    3520             : 
    3521        5031 :     GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
    3522        5031 :     if (pabyChunkNodataMask)
    3523             :         pabyChunkNodataMaskHorizontalFiltered =
    3524         462 :             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
    3525        5031 :     if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
    3526         462 :         (pabyChunkNodataMask != nullptr &&
    3527             :          pabyChunkNodataMaskHorizontalFiltered == nullptr))
    3528             :     {
    3529           0 :         VSIFree(pafWrkScanline);
    3530           0 :         VSIFree(padfHorizontalFiltered);
    3531           0 :         VSIFreeAligned(padfWeights);
    3532           0 :         VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    3533           0 :         return CE_Failure;
    3534             :     }
    3535             : 
    3536             :     /* ==================================================================== */
    3537             :     /*      First pass: horizontal filter                                   */
    3538             :     /* ==================================================================== */
    3539        5031 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    3540             : #ifdef USE_SSE2
    3541        5031 :     const bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
    3542             : #endif
    3543     3025812 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    3544             :     {
    3545     3020784 :         const double dfSrcPixel =
    3546     3020784 :             (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
    3547     3020784 :         int nSrcPixelStart =
    3548     3020784 :             static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
    3549     3020784 :         if (nSrcPixelStart < nChunkXOff)
    3550       57238 :             nSrcPixelStart = nChunkXOff;
    3551     3020784 :         int nSrcPixelStop =
    3552     3020784 :             static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
    3553     3020784 :         if (nSrcPixelStop > nChunkRightXOff)
    3554       57258 :             nSrcPixelStop = nChunkRightXOff;
    3555             : #if 0
    3556             :         if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
    3557             :         {
    3558             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3559             :         }
    3560             :         if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
    3561             :         {
    3562             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3563             :         }
    3564             : #endif
    3565     3020784 :         const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
    3566     3020784 :         double dfWeightSum = 0.0;
    3567             : 
    3568             :         // Compute convolution coefficients.
    3569     3020784 :         int nSrcPixel = nSrcPixelStart;
    3570     3020784 :         double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
    3571     4375084 :         for (; nSrcPixel < nSrcPixelStop - 3; nSrcPixel += 4)
    3572             :         {
    3573     1354097 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
    3574     1354097 :             dfX += dfXScaleWeight;
    3575     1354097 :             padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
    3576     1354097 :             dfX += dfXScaleWeight;
    3577     1354097 :             padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
    3578     1354097 :             dfX += dfXScaleWeight;
    3579     1354097 :             padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
    3580     1354097 :             dfX += dfXScaleWeight;
    3581     1354302 :             dfWeightSum +=
    3582     1354097 :                 pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
    3583             :         }
    3584     7011598 :         for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
    3585             :         {
    3586     3990747 :             const double dfWeight = pfnFilterFunc(dfX);
    3587     3990609 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
    3588     3990609 :             dfWeightSum += dfWeight;
    3589             :         }
    3590             : 
    3591     3020851 :         const int nHeight = nChunkYSize * nBands;
    3592     3020851 :         if (pabyChunkNodataMask == nullptr)
    3593             :         {
    3594             :             // For floating-point data types, we must scale values down a bit
    3595             :             // if input values are close to +/- std::numeric_limits<T>::max()
    3596             : #ifdef OLD_CPPCHECK
    3597             :             constexpr double mulFactor = 1;
    3598             : #else
    3599     2933099 :             constexpr double mulFactor =
    3600             :                 (bNeedRescale &&
    3601             :                  (std::is_same_v<T, float> || std::is_same_v<T, double>))
    3602             :                     ? 2
    3603             :                     : 1;
    3604             : #endif
    3605             : 
    3606     2933099 :             if (dfWeightSum != 0)
    3607             :             {
    3608     2933104 :                 const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
    3609    11692164 :                 for (int i = 0; i < nSrcPixelCount; ++i)
    3610             :                 {
    3611     8759063 :                     padfWeights[i] *= dfInvWeightSum;
    3612             :                 }
    3613             :             }
    3614             : 
    3615   166101460 :             const auto ScaleValue = [  // Maps a raw horizontal convolution result (dfVal) to the value stored in padfHorizontalFiltered.
    3616             : #ifdef _MSC_VER
    3617             :                                         mulFactor  // captured explicitly for MSVC only -- presumably a compiler workaround; TODO confirm
    3618             : #endif
    3619             :             ](double dfVal, [[maybe_unused]] const T *inputValues,  // inputValues / nInputValues: the source samples dfVal was computed from
    3620             :                                     [[maybe_unused]] int nInputValues)
    3621             :             {
    3622   166101000 :                 constexpr bool isFloat =
    3623             :                     std::is_same_v<T, float> || std::is_same_v<T, double>;
    3624             :                 if constexpr (isFloat)  // for any other T, dfVal is returned unchanged at the bottom
    3625             :                 {
    3626     4070140 :                     if (std::isfinite(dfVal))  // clamp first so that multiplying back by mulFactor (the weights were divided by it) cannot overflow a double
    3627             :                     {
    3628             :                         return std::clamp(dfVal,
    3629    12204800 :                                           -std::numeric_limits<double>::max() /
    3630             :                                               mulFactor,
    3631     4068260 :                                           std::numeric_limits<double>::max() /
    3632     4068260 :                                               mulFactor) *
    3633     4068260 :                                mulFactor;
    3634             :                     }
    3635             :                     else if constexpr (bKernelWithNegativeWeights)
    3636             :                     {
    3637         936 :                         if (std::isnan(dfVal))
    3638             :                         {
    3639             :                             // NaN result: either one of the input values is NaN, or they are +/-Inf. Keep NaN unless every input is non-NaN and has the same sign as the first one.
    3640         936 :                             const bool isPositive = inputValues[0] >= 0;
    3641        6008 :                             for (int i = 0; i < nInputValues; ++i)
    3642             :                             {
    3643        5384 :                                 if (std::isnan(inputValues[i]))
    3644         312 :                                     return dfVal;
    3645             :                                 // cppcheck-suppress knownConditionTrueFalse
    3646        5072 :                                 if ((inputValues[i] >= 0) != isPositive)
    3647           0 :                                     return dfVal;
    3648             :                             }
    3649             :                             // No NaN and a single sign (per the note above, all +Inf or all -Inf): return the first input.
    3650         624 :                             return static_cast<double>(inputValues[0]);
    3651             :                         }
    3652             :                     }
    3653             :                 }
    3654   162032000 :                 return dfVal;  // non-floating-point T, or a non-finite value not handled above
    3655             :             };
    3656             : 
    3657     2933099 :             int iSrcLineOff = 0;
    3658             : #ifdef USE_SSE2
    3659     2933099 :             if (nSrcPixelCount == 4)
    3660             :             {
    3661    15788535 :                 for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
    3662             :                 {
    3663    15169866 :                     const size_t j =
    3664    15169866 :                         static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3665    15169866 :                         (nSrcPixelStart - nChunkXOff);
    3666    15169866 :                     double dfVal1 = 0.0;
    3667    15169866 :                     double dfVal2 = 0.0;
    3668    15169866 :                     double dfVal3 = 0.0;
    3669    15169866 :                     GDALResampleConvolutionHorizontalPixelCount4_3rows(
    3670    15169866 :                         pChunk + j, pChunk + j + nChunkXSize,
    3671    15169866 :                         pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
    3672             :                         dfVal2, dfVal3);
    3673    30346746 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3674    15159066 :                                                nDstXSize +
    3675    15159066 :                                            iDstPixel - nDstXOff] =
    3676    15159066 :                         ScaleValue(dfVal1, pChunk + j, 4);
    3677    30360746 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3678    15187746 :                                             1) *
    3679    15187746 :                                                nDstXSize +
    3680    15187746 :                                            iDstPixel - nDstXOff] =
    3681    15187746 :                         ScaleValue(dfVal2, pChunk + j + nChunkXSize, 4);
    3682    15175245 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3683    15172966 :                                             2) *
    3684    15172966 :                                                nDstXSize +
    3685    15172966 :                                            iDstPixel - nDstXOff] =
    3686    15172966 :                         ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize, 4);
    3687             :                 }
    3688             :             }
    3689     2319407 :             else if (bSrcPixelCountLess8)
    3690             :             {
    3691     9200102 :                 for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
    3692             :                 {
    3693     7137561 :                     const size_t j =
    3694     7137561 :                         static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3695     7137561 :                         (nSrcPixelStart - nChunkXOff);
    3696     7137561 :                     double dfVal1 = 0.0;
    3697     7137561 :                     double dfVal2 = 0.0;
    3698     7137561 :                     double dfVal3 = 0.0;
    3699     7137561 :                     GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    3700     7137561 :                         pChunk + j, pChunk + j + nChunkXSize,
    3701     7137561 :                         pChunk + j + 2 * nChunkXSize, padfWeights,
    3702             :                         nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3703    14268458 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3704     7134556 :                                                nDstXSize +
    3705     7134556 :                                            iDstPixel - nDstXOff] =
    3706     7134556 :                         ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
    3707    14268527 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3708     7133902 :                                             1) *
    3709     7133902 :                                                nDstXSize +
    3710     7133902 :                                            iDstPixel - nDstXOff] =
    3711     7133902 :                         ScaleValue(dfVal2, pChunk + j + nChunkXSize,
    3712             :                                    nSrcPixelCount);
    3713     7135683 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3714     7134625 :                                             2) *
    3715     7134625 :                                                nDstXSize +
    3716     7134625 :                                            iDstPixel - nDstXOff] =
    3717     7134625 :                         ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
    3718             :                                    nSrcPixelCount);
    3719             :                 }
    3720             :             }
    3721             :             else
    3722             : #endif
    3723             :             {
    3724    32386265 :                 for (; iSrcLineOff < nHeight - 2; iSrcLineOff += 3)
    3725             :                 {
    3726    32112644 :                     const size_t j =
    3727    32112644 :                         static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3728    32112644 :                         (nSrcPixelStart - nChunkXOff);
    3729    32112644 :                     double dfVal1 = 0.0;
    3730    32112644 :                     double dfVal2 = 0.0;
    3731    32112644 :                     double dfVal3 = 0.0;
    3732    32112644 :                     GDALResampleConvolutionHorizontal_3rows(
    3733    32112644 :                         pChunk + j, pChunk + j + nChunkXSize,
    3734    32112644 :                         pChunk + j + 2 * nChunkXSize, padfWeights,
    3735             :                         nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3736    64216398 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3737    32110144 :                                                nDstXSize +
    3738    32110144 :                                            iDstPixel - nDstXOff] =
    3739    32110144 :                         ScaleValue(dfVal1, pChunk + j, nSrcPixelCount);
    3740    64229498 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3741    32106244 :                                             1) *
    3742    32106244 :                                                nDstXSize +
    3743    32106244 :                                            iDstPixel - nDstXOff] =
    3744    32106244 :                         ScaleValue(dfVal2, pChunk + j + nChunkXSize,
    3745             :                                    nSrcPixelCount);
    3746    32196480 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3747    32123244 :                                             2) *
    3748    32123244 :                                                nDstXSize +
    3749    32123244 :                                            iDstPixel - nDstXOff] =
    3750    32123244 :                         ScaleValue(dfVal3, pChunk + j + 2 * nChunkXSize,
    3751             :                                    nSrcPixelCount);
    3752             :                 }
    3753             :             }
    3754     6091994 :             for (; iSrcLineOff < nHeight; ++iSrcLineOff)
    3755             :             {
    3756     3158930 :                 const size_t j =
    3757     3158930 :                     static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3758     3158930 :                     (nSrcPixelStart - nChunkXOff);
    3759     3707934 :                 const double dfVal = GDALResampleConvolutionHorizontal(
    3760      594956 :                     pChunk + j, padfWeights, nSrcPixelCount);
    3761     3159492 :                 padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3762     3159019 :                                            nDstXSize +
    3763     3159019 :                                        iDstPixel - nDstXOff] =
    3764     3159019 :                     ScaleValue(dfVal, pChunk + j, nSrcPixelCount);
    3765             :             }
    3766             :         }
    3767             :         else
    3768             :         {
    3769    20501636 :             for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
    3770             :             {
    3771    20413730 :                 const size_t j =
    3772    20413730 :                     static_cast<size_t>(iSrcLineOff) * nChunkXSize +
    3773    20413730 :                     (nSrcPixelStart - nChunkXOff);
    3774             : 
    3775             :                 if (bKernelWithNegativeWeights)
    3776             :                 {
    3777    19888612 :                     int nConsecutiveValid = 0;
    3778    19888612 :                     int nMaxConsecutiveValid = 0;
    3779   181872458 :                     for (int k = 0; k < nSrcPixelCount; k++)
    3780             :                     {
    3781   161985146 :                         if (pabyChunkNodataMask[j + k])
    3782    48863253 :                             nConsecutiveValid++;
    3783   113121793 :                         else if (nConsecutiveValid)
    3784             :                         {
    3785      106953 :                             nMaxConsecutiveValid = std::max(
    3786      107790 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    3787      106953 :                             nConsecutiveValid = 0;
    3788             :                         }
    3789             :                     }
    3790    19888212 :                     nMaxConsecutiveValid =
    3791    19887812 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    3792    19888212 :                     if (nMaxConsecutiveValid < nSrcPixelCount / 2)
    3793             :                     {
    3794    13314907 :                         const size_t nTempOffset =
    3795    13314907 :                             static_cast<size_t>(iSrcLineOff) * nDstXSize +
    3796    13314907 :                             iDstPixel - nDstXOff;
    3797    13314907 :                         padfHorizontalFiltered[nTempOffset] = 0.0;
    3798    13314907 :                         pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3799    13314907 :                         continue;
    3800             :                     }
    3801             :                 }
    3802             : 
    3803     7098433 :                 double dfVal = 0.0;
    3804     7098433 :                 GDALResampleConvolutionHorizontalWithMask(
    3805       44639 :                     pChunk + j, pabyChunkNodataMask + j, padfWeights,
    3806             :                     nSrcPixelCount, dfVal, dfWeightSum);
    3807     7098835 :                 const size_t nTempOffset =
    3808     7098835 :                     static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
    3809     7098835 :                     nDstXOff;
    3810     7098835 :                 if (dfWeightSum > 0.0)
    3811             :                 {
    3812     7043358 :                     padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
    3813     7043358 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
    3814             :                 }
    3815             :                 else
    3816             :                 {
    3817       55532 :                     padfHorizontalFiltered[nTempOffset] = 0.0;
    3818       55532 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3819             :                 }
    3820             :             }
    3821             :         }
    3822             :     }
    3823             : 
    3824             :     /* ==================================================================== */
    3825             :     /*      Second pass: vertical filter                                    */
    3826             :     /* ==================================================================== */
    3827        5032 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    3828             : 
    3829      376391 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    3830             :     {
    3831      371359 :         Twork *const pafDstScanline =
    3832             :             pafWrkScanline
    3833      371359 :                 ? pafWrkScanline
    3834        8797 :                 : static_cast<Twork *>(pDstBuffer) +
    3835        8797 :                       static_cast<size_t>(iDstLine - nDstYOff) * nDstXSize;
    3836             : 
    3837      371359 :         const double dfSrcLine =
    3838      371359 :             (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
    3839      371359 :         int nSrcLineStart =
    3840      371359 :             static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
    3841      371359 :         int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
    3842      371359 :         if (nSrcLineStart < nChunkYOff)
    3843        3361 :             nSrcLineStart = nChunkYOff;
    3844      371359 :         if (nSrcLineStop > nChunkBottomYOff)
    3845        3405 :             nSrcLineStop = nChunkBottomYOff;
    3846             : #if 0
    3847             :         if( nSrcLineStart < nChunkYOff &&
    3848             :             nChunkYOff > 0 )
    3849             :         {
    3850             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    3851             :         }
    3852             :         if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
    3853             :         {
    3854             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    3855             :         }
    3856             : #endif
    3857      371359 :         const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
    3858      371359 :         double dfWeightSum = 0.0;
    3859             : 
    3860             :         // Compute convolution coefficients.
    3861      371359 :         int nSrcLine = nSrcLineStart;  // Used after for.
    3862      371359 :         double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
    3863      943548 :         for (; nSrcLine < nSrcLineStop - 3;
    3864      572189 :              nSrcLine += 4, dfY += 4 * dfYScaleWeight)
    3865             :         {
    3866      572191 :             padfWeights[nSrcLine - nSrcLineStart] = dfY;
    3867      572191 :             padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
    3868      572191 :             padfWeights[nSrcLine + 2 - nSrcLineStart] =
    3869      572191 :                 dfY + 2 * dfYScaleWeight;
    3870      572191 :             padfWeights[nSrcLine + 3 - nSrcLineStart] =
    3871      572191 :                 dfY + 3 * dfYScaleWeight;
    3872      572189 :             dfWeightSum +=
    3873      572191 :                 pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
    3874             :         }
    3875      409056 :         for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
    3876             :         {
    3877       37706 :             const double dfWeight = pfnFilterFunc(dfY);
    3878       37699 :             padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
    3879       37699 :             dfWeightSum += dfWeight;
    3880             :         }
    3881             : 
    3882      371350 :         if (pabyChunkNodataMask == nullptr)
    3883             :         {
    3884             :             // For floating-point data types, we must scale values down a bit
    3885             :             // if input values are close to +/- std::numeric_limits<T>::max()
    3886             : #ifdef OLD_CPPCHECK
    3887             :             constexpr double mulFactor = 1;
    3888             : #else
    3889      332360 :             constexpr double mulFactor =
    3890             :                 (bNeedRescale &&
    3891             :                  (std::is_same_v<T, float> || std::is_same_v<T, double>))
    3892             :                     ? 2
    3893             :                     : 1;
    3894             : #endif
    3895             : 
    3896      332360 :             if (dfWeightSum != 0)
    3897             :             {
    3898      332360 :                 const double dfInvWeightSum = 1.0 / (mulFactor * dfWeightSum);
    3899     2386153 :                 for (int i = 0; i < nSrcLineCount; ++i)
    3900     2053785 :                     padfWeights[i] *= dfInvWeightSum;
    3901             :             }
    3902             : 
    3903      332360 :             int iFilteredPixelOff = 0;  // Used after for.
    3904             :             // j used after for.
    3905      332360 :             size_t j =
    3906      332360 :                 (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
    3907             : #ifdef USE_SSE2
    3908             :             if constexpr ((!bNeedRescale ||
    3909             :                            !std::is_same_v<T, float>)&&eWrkDataType ==
    3910             :                           GDT_Float32)
    3911             :             {
    3912             : #ifdef __AVX__
    3913             :                 for (; iFilteredPixelOff < nDstXSize - 15;
    3914             :                      iFilteredPixelOff += 16, j += 16)
    3915             :                 {
    3916             :                     GDALResampleConvolutionVertical_16cols(
    3917             :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3918             :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    3919             :                     if (bHasNoData)
    3920             :                     {
    3921             :                         for (int k = 0; k < 16; k++)
    3922             :                         {
    3923             :                             pafDstScanline[iFilteredPixelOff + k] =
    3924             :                                 replaceValIfNodata(
    3925             :                                     pafDstScanline[iFilteredPixelOff + k]);
    3926             :                         }
    3927             :                     }
    3928             :                 }
    3929             : #else
    3930    25148897 :                 for (; iFilteredPixelOff < nDstXSize - 7;
    3931             :                      iFilteredPixelOff += 8, j += 8)
    3932             :                 {
    3933    24854008 :                     GDALResampleConvolutionVertical_8cols(
    3934    24854008 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3935    24854008 :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    3936    24825338 :                     if (bHasNoData)
    3937             :                     {
    3938      123192 :                         for (int k = 0; k < 8; k++)
    3939             :                         {
    3940      109504 :                             pafDstScanline[iFilteredPixelOff + k] =
    3941      109504 :                                 replaceValIfNodata(
    3942      109504 :                                     pafDstScanline[iFilteredPixelOff + k]);
    3943             :                         }
    3944             :                     }
    3945             :                 }
    3946             : #endif
    3947             : 
    3948      758986 :                 for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
    3949             :                 {
    3950      464121 :                     const Twork fVal =
    3951      464085 :                         static_cast<Twork>(GDALResampleConvolutionVertical(
    3952      464085 :                             padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3953             :                             nSrcLineCount));
    3954      464076 :                     pafDstScanline[iFilteredPixelOff] =
    3955      464121 :                         replaceValIfNodata(fVal);
    3956             :                 }
    3957             :             }
    3958             :             else
    3959             : #endif
    3960             :             {
    3961     5862642 :                 const auto ScaleValue = [  // Maps a raw vertical convolution result (dfVal) to the value written to the destination scanline.
    3962             : #ifdef _MSC_VER
    3963             :                                             mulFactor  // captured explicitly for MSVC only -- presumably a compiler workaround; TODO confirm
    3964             : #endif
    3965             :                 ](double dfVal, [[maybe_unused]] const double *inputValues,  // inputValues: first sample of the filtered column dfVal was computed from
    3966             :                                         [[maybe_unused]] int nStride,  // distance, in doubles, between consecutive samples of that column
    3967             :                                         [[maybe_unused]] int nInputValues)  // number of samples in that column
    3968             :                 {
    3969     5862640 :                     constexpr bool isFloat =
    3970             :                         std::is_same_v<T, float> || std::is_same_v<T, double>;
    3971             :                     if constexpr (isFloat)  // for any other T, dfVal is returned unchanged at the bottom
    3972             :                     {
    3973     5862640 :                         if (std::isfinite(dfVal))  // clamp to the Twork range / mulFactor so that multiplying back by mulFactor stays within Twork
    3974             :                         {
    3975             :                             return std::clamp(
    3976             :                                        dfVal,
    3977             :                                        static_cast<double>(
    3978    17585400 :                                            -std::numeric_limits<Twork>::max()) /
    3979             :                                            mulFactor,
    3980             :                                        static_cast<double>(
    3981     5861800 :                                            std::numeric_limits<Twork>::max()) /
    3982     5861800 :                                            mulFactor) *
    3983     5861800 :                                    mulFactor;
    3984             :                         }
    3985             :                         else if constexpr (bKernelWithNegativeWeights)
    3986             :                         {
    3987         480 :                             if (std::isnan(dfVal))
    3988             :                             {
    3989             :                                 // NaN result: either one of the input values is NaN, or they are +/-Inf. Keep NaN unless every input is non-NaN and has the same sign as the first one.
    3990         480 :                                 const bool isPositive = inputValues[0] >= 0;
    3991        2520 :                                 for (int i = 0; i < nInputValues; ++i)
    3992             :                                 {
    3993        2200 :                                     if (std::isnan(inputValues[i * nStride]))
    3994         160 :                                         return dfVal;
    3995             :                                     // cppcheck-suppress knownConditionTrueFalse
    3996        2040 :                                     if ((inputValues[i * nStride] >= 0) != isPositive)  // must test the same strided sample as the NaN check, not a neighbouring column
    3997           0 :                                         return dfVal;
    3998             :                                 }
    3999             :                                 // No NaN and a single sign (per the note above, all +Inf or all -Inf): return the first input.
    4000         320 :                                 return inputValues[0];
    4001             :                             }
    4002             :                         }
    4003             :                     }
    4004             : 
    4005         360 :                     return dfVal;  // non-floating-point T, or a non-finite value not handled above
    4006             :                 };
    4007             : 
    4008     2939422 :                 for (; iFilteredPixelOff < nDstXSize - 1;
    4009             :                      iFilteredPixelOff += 2, j += 2)
    4010             :                 {
    4011     2930610 :                     double dfVal1 = 0.0;
    4012     2930610 :                     double dfVal2 = 0.0;
    4013     2930610 :                     GDALResampleConvolutionVertical_2cols(
    4014     2930610 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    4015             :                         nSrcLineCount, dfVal1, dfVal2);
    4016     5861220 :                     pafDstScanline[iFilteredPixelOff] =
    4017     2930610 :                         replaceValIfNodata(static_cast<Twork>(
    4018     2930610 :                             ScaleValue(dfVal1, padfHorizontalFiltered + j,
    4019             :                                        nDstXSize, nSrcLineCount)));
    4020     2930610 :                     pafDstScanline[iFilteredPixelOff + 1] =
    4021     2930610 :                         replaceValIfNodata(static_cast<Twork>(
    4022     2930610 :                             ScaleValue(dfVal2, padfHorizontalFiltered + j + 1,
    4023             :                                        nDstXSize, nSrcLineCount)));
    4024             :                 }
    4025        8819 :                 if (iFilteredPixelOff < nDstXSize)
    4026             :                 {
    4027        1427 :                     const double dfVal = GDALResampleConvolutionVertical(
    4028        1427 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    4029             :                         nSrcLineCount);
    4030        1427 :                     pafDstScanline[iFilteredPixelOff] =
    4031        1427 :                         replaceValIfNodata(static_cast<Twork>(
    4032        1427 :                             ScaleValue(dfVal, padfHorizontalFiltered + j,
    4033             :                                        nDstXSize, nSrcLineCount)));
    4034             :                 }
    4035             :             }
    4036             :         }
    4037             :         else
    4038             :         {
    4039    19012048 :             for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
    4040             :                  ++iFilteredPixelOff)
    4041             :             {
    4042    18962057 :                 double dfVal = 0.0;
    4043    18962057 :                 dfWeightSum = 0.0;
    4044    18962057 :                 size_t j = (nSrcLineStart - nChunkYOff) *
    4045    18962057 :                                static_cast<size_t>(nDstXSize) +
    4046    18962057 :                            iFilteredPixelOff;
    4047             :                 if (bKernelWithNegativeWeights)
    4048             :                 {
    4049    18718501 :                     int nConsecutiveValid = 0;
    4050    18718501 :                     int nMaxConsecutiveValid = 0;
    4051   133044321 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    4052             :                     {
    4053   114303020 :                         const double dfWeight =
    4054   114303020 :                             padfWeights[i] *
    4055             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    4056   114303020 :                         if (pabyChunkNodataMaskHorizontalFiltered[j])
    4057             :                         {
    4058    48584037 :                             nConsecutiveValid++;
    4059             :                         }
    4060    65719283 :                         else if (nConsecutiveValid)
    4061             :                         {
    4062      226934 :                             nMaxConsecutiveValid = std::max(
    4063      204376 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    4064      226934 :                             nConsecutiveValid = 0;
    4065             :                         }
    4066   114326020 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    4067   114326020 :                         dfWeightSum += dfWeight;
    4068             :                     }
    4069    18740601 :                     nMaxConsecutiveValid =
    4070    18741001 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    4071    18740601 :                     if (nMaxConsecutiveValid < nSrcLineCount / 2)
    4072             :                     {
    4073     9246271 :                         pafDstScanline[iFilteredPixelOff] =
    4074     9246179 :                             static_cast<Twork>(dfNoDataValue);
    4075     9246271 :                         continue;
    4076             :                     }
    4077             :                 }
    4078             :                 else
    4079             :                 {
    4080     1237062 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    4081             :                     {
    4082      993504 :                         const double dfWeight =
    4083      993504 :                             padfWeights[i] *
    4084             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    4085      993504 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    4086      993504 :                         dfWeightSum += dfWeight;
    4087             :                     }
    4088             :                 }
    4089     9737866 :                 if (dfWeightSum > 0.0)
    4090             :                 {
    4091     9710707 :                     pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
    4092     9721825 :                         static_cast<Twork>(dfVal / dfWeightSum));
    4093             :                 }
    4094             :                 else
    4095             :                 {
    4096       16036 :                     pafDstScanline[iFilteredPixelOff] =
    4097       16012 :                         static_cast<Twork>(dfNoDataValue);
    4098             :                 }
    4099             :             }
    4100             :         }
    4101             : 
    4102      353681 :         if (fMaxVal != 0.0f)
    4103             :         {
    4104             :             if constexpr (std::is_same_v<T, double>)
    4105             :             {
    4106           0 :                 for (int i = 0; i < nDstXSize; ++i)
    4107             :                 {
    4108           0 :                     if (pafDstScanline[i] > static_cast<double>(fMaxVal))
    4109           0 :                         pafDstScanline[i] = static_cast<double>(fMaxVal);
    4110             :                 }
    4111             :             }
    4112             :             else
    4113             :             {
    4114      192324 :                 for (int i = 0; i < nDstXSize; ++i)
    4115             :                 {
    4116      192088 :                     if (pafDstScanline[i] > fMaxVal)
    4117       96022 :                         pafDstScanline[i] = fMaxVal;
    4118             :                 }
    4119             :             }
    4120             :         }
    4121             : 
    4122      353681 :         if (pafWrkScanline)
    4123             :         {
    4124      362563 :             GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
    4125             :                             static_cast<GByte *>(pDstBuffer) +
    4126      362563 :                                 static_cast<size_t>(iDstLine - nDstYOff) *
    4127      362563 :                                     nDstXSize * nDstDataTypeSize,
    4128             :                             dstDataType, nDstDataTypeSize, nDstXSize);
    4129             :         }
    4130             :     }
    4131             : 
    4132        5032 :     VSIFree(pafWrkScanline);
    4133        5032 :     VSIFreeAligned(padfWeights);
    4134        5032 :     VSIFree(padfHorizontalFiltered);
    4135        5032 :     VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    4136             : 
    4137        5032 :     return CE_None;
    4138             : }
    4139             : 
    4140             : template <bool bKernelWithNegativeWeights, bool bNeedRescale>
    4141             : static CPLErr
    4142        5032 : GDALResampleChunk_ConvolutionInternal(const GDALOverviewResampleArgs &args,
    4143             :                                       const void *pChunk, void **ppDstBuffer,
    4144             :                                       GDALDataType *peDstBufferDataType)
    4145             : {
    4146             :     GDALResampleAlg eResample;
    4147        5032 :     if (EQUAL(args.pszResampling, "BILINEAR"))
    4148        2660 :         eResample = GRA_Bilinear;
    4149        2372 :     else if (EQUAL(args.pszResampling, "CUBIC"))
    4150        2219 :         eResample = GRA_Cubic;
    4151         153 :     else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
    4152          59 :         eResample = GRA_CubicSpline;
    4153          94 :     else if (EQUAL(args.pszResampling, "LANCZOS"))
    4154          90 :         eResample = GRA_Lanczos;
    4155             :     else
    4156             :     {
    4157           4 :         CPLAssert(false);
    4158             :         return CE_Failure;
    4159             :     }
    4160        5028 :     const int nKernelRadius = GWKGetFilterRadius(eResample);
    4161        5026 :     FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
    4162             :     const FilterFunc4ValuesType pfnFilterFunc4Values =
    4163        5028 :         GWKGetFilterFunc4Values(eResample);
    4164             : 
    4165        5025 :     float fMaxVal = 0.f;
    4166             :     // Cubic, etc... can have overshoots, so make sure we clamp values to the
    4167             :     // maximum value if NBITS is set.
    4168        5025 :     if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
    4169           8 :         (args.eOvrDataType == GDT_Byte || args.eOvrDataType == GDT_UInt16 ||
    4170           0 :          args.eOvrDataType == GDT_UInt32))
    4171             :     {
    4172           8 :         int nBits = args.nOvrNBITS;
    4173           8 :         if (nBits == GDALGetDataTypeSizeBits(args.eOvrDataType))
    4174           1 :             nBits = 0;
    4175           8 :         if (nBits > 0 && nBits < 32)
    4176           7 :             fMaxVal = static_cast<float>((1U << nBits) - 1);
    4177             :     }
    4178             : 
    4179        5025 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(
    4180             :         args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
    4181             :         GDALGetDataTypeSizeBytes(args.eOvrDataType));
    4182        5032 :     if (*ppDstBuffer == nullptr)
    4183             :     {
    4184           0 :         return CE_Failure;
    4185             :     }
    4186        5032 :     *peDstBufferDataType = args.eOvrDataType;
    4187             : 
    4188        5032 :     switch (args.eWrkDataType)
    4189             :     {
    4190        4164 :         case GDT_Byte:
    4191             :         {
    4192             :             return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32,
    4193             :                                                   bKernelWithNegativeWeights,
    4194        4164 :                                                   bNeedRescale>(
    4195             :                 args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
    4196        4164 :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
    4197             :         }
    4198             : 
    4199         402 :         case GDT_UInt16:
    4200             :         {
    4201             :             return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32,
    4202             :                                                   bKernelWithNegativeWeights,
    4203         402 :                                                   bNeedRescale>(
    4204             :                 args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
    4205         402 :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
    4206             :         }
    4207             : 
    4208         375 :         case GDT_Float32:
    4209             :         {
    4210             :             return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32,
    4211             :                                                   bKernelWithNegativeWeights,
    4212         375 :                                                   bNeedRescale>(
    4213             :                 args, static_cast<const float *>(pChunk), *ppDstBuffer,
    4214         375 :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
    4215             :         }
    4216             : 
    4217          91 :         case GDT_Float64:
    4218             :         {
    4219             :             return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64,
    4220             :                                                   bKernelWithNegativeWeights,
    4221          91 :                                                   bNeedRescale>(
    4222             :                 args, static_cast<const double *>(pChunk), *ppDstBuffer,
    4223          91 :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius, fMaxVal);
    4224             :         }
    4225             : 
    4226           0 :         default:
    4227           0 :             break;
    4228             :     }
    4229             : 
    4230           0 :     CPLAssert(false);
    4231             :     return CE_Failure;
    4232             : }
    4233             : 
    4234             : static CPLErr
    4235        5032 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
    4236             :                               const void *pChunk, void **ppDstBuffer,
    4237             :                               GDALDataType *peDstBufferDataType)
    4238             : {
    4239        5032 :     if (EQUAL(args.pszResampling, "CUBIC") ||
    4240        2809 :         EQUAL(args.pszResampling, "LANCZOS"))
    4241             :         return GDALResampleChunk_ConvolutionInternal<
    4242        2313 :             /* bKernelWithNegativeWeights=*/true, /* bNeedRescale = */ true>(
    4243        2313 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    4244        2719 :     else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
    4245          59 :         return GDALResampleChunk_ConvolutionInternal<false, true>(
    4246          59 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    4247             :     else
    4248        2660 :         return GDALResampleChunk_ConvolutionInternal<false, false>(
    4249        2660 :             args, pChunk, ppDstBuffer, peDstBufferDataType);
    4250             : }
    4251             : 
    4252             : /************************************************************************/
    4253             : /*                       GDALResampleChunkC32R()                        */
    4254             : /************************************************************************/
    4255             : 
    4256           2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
    4257             :                                     const float *pafChunk, const int nChunkYOff,
    4258             :                                     const int nChunkYSize, const int nDstYOff,
    4259             :                                     const int nDstYOff2, const int nOvrXSize,
    4260             :                                     const int nOvrYSize, void **ppDstBuffer,
    4261             :                                     GDALDataType *peDstBufferDataType,
    4262             :                                     const char *pszResampling)
    4263             : 
    4264             : {
    4265             :     enum Method
    4266             :     {
    4267             :         NEAR,
    4268             :         AVERAGE,
    4269             :         AVERAGE_MAGPHASE,
    4270             :         RMS,
    4271             :     };
    4272             : 
    4273           2 :     Method eMethod = NEAR;
    4274           2 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))
    4275             :     {
    4276           0 :         eMethod = NEAR;
    4277             :     }
    4278           2 :     else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
    4279             :     {
    4280           0 :         eMethod = AVERAGE_MAGPHASE;
    4281             :     }
    4282           2 :     else if (EQUAL(pszResampling, "RMS"))
    4283             :     {
    4284           2 :         eMethod = RMS;
    4285             :     }
    4286           0 :     else if (STARTS_WITH_CI(pszResampling, "AVER"))
    4287             :     {
    4288           0 :         eMethod = AVERAGE;
    4289             :     }
    4290             :     else
    4291             :     {
    4292           0 :         CPLError(
    4293             :             CE_Failure, CPLE_NotSupported,
    4294             :             "Resampling method %s is not supported for complex data types. "
    4295             :             "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
    4296             :             pszResampling);
    4297           0 :         return CE_Failure;
    4298             :     }
    4299             : 
    4300           2 :     const int nOXSize = nOvrXSize;
    4301           2 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
    4302             :                                        GDALGetDataTypeSizeBytes(GDT_CFloat32));
    4303           2 :     if (*ppDstBuffer == nullptr)
    4304             :     {
    4305           0 :         return CE_Failure;
    4306             :     }
    4307           2 :     float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
    4308           2 :     *peDstBufferDataType = GDT_CFloat32;
    4309             : 
    4310           2 :     const int nOYSize = nOvrYSize;
    4311           2 :     const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
    4312           2 :     const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
    4313             : 
    4314             :     /* ==================================================================== */
    4315             :     /*      Loop over destination scanlines.                                */
    4316             :     /* ==================================================================== */
    4317           8 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    4318             :     {
    4319           6 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    4320           6 :         if (nSrcYOff < nChunkYOff)
    4321           0 :             nSrcYOff = nChunkYOff;
    4322             : 
    4323           6 :         int nSrcYOff2 =
    4324           6 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
    4325           6 :         if (nSrcYOff2 == nSrcYOff)
    4326           0 :             nSrcYOff2++;
    4327             : 
    4328           6 :         if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
    4329             :         {
    4330           2 :             if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
    4331           0 :                 nSrcYOff = nSrcHeight - 1;
    4332           2 :             nSrcYOff2 = nSrcHeight;
    4333             :         }
    4334           6 :         if (nSrcYOff2 > nChunkYOff + nChunkYSize)
    4335           0 :             nSrcYOff2 = nChunkYOff + nChunkYSize;
    4336             : 
    4337           6 :         const float *const pafSrcScanline =
    4338           6 :             pafChunk +
    4339           6 :             (static_cast<size_t>(nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
    4340           6 :         float *const pafDstScanline =
    4341           6 :             pafDstBuffer +
    4342           6 :             static_cast<size_t>(iDstLine - nDstYOff) * 2 * nOXSize;
    4343             : 
    4344             :         /* --------------------------------------------------------------------
    4345             :          */
    4346             :         /*      Loop over destination pixels */
    4347             :         /* --------------------------------------------------------------------
    4348             :          */
    4349          18 :         for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
    4350             :         {
    4351          12 :             const size_t iDstPixelSZ = static_cast<size_t>(iDstPixel);
    4352          12 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    4353          12 :             int nSrcXOff2 =
    4354          12 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
    4355          12 :             if (nSrcXOff2 == nSrcXOff)
    4356           0 :                 nSrcXOff2++;
    4357          12 :             if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
    4358             :             {
    4359           6 :                 if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
    4360           0 :                     nSrcXOff = nSrcWidth - 1;
    4361           6 :                 nSrcXOff2 = nSrcWidth;
    4362             :             }
    4363          12 :             const size_t nSrcXOffSZ = static_cast<size_t>(nSrcXOff);
    4364             : 
    4365          12 :             if (eMethod == NEAR)
    4366             :             {
    4367           0 :                 pafDstScanline[iDstPixelSZ * 2] =
    4368           0 :                     pafSrcScanline[nSrcXOffSZ * 2];
    4369           0 :                 pafDstScanline[iDstPixelSZ * 2 + 1] =
    4370           0 :                     pafSrcScanline[nSrcXOffSZ * 2 + 1];
    4371             :             }
    4372          12 :             else if (eMethod == AVERAGE_MAGPHASE)
    4373             :             {
    4374           0 :                 double dfTotalR = 0.0;
    4375           0 :                 double dfTotalI = 0.0;
    4376           0 :                 double dfTotalM = 0.0;
    4377           0 :                 size_t nCount = 0;
    4378             : 
    4379           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4380             :                 {
    4381           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4382             :                     {
    4383           0 :                         const double dfR = double(
    4384           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4385           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4386           0 :                                                nSrcWidth * 2]);
    4387           0 :                         const double dfI = double(
    4388           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4389           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4390           0 :                                                nSrcWidth * 2 +
    4391           0 :                                            1]);
    4392           0 :                         dfTotalR += dfR;
    4393           0 :                         dfTotalI += dfI;
    4394           0 :                         dfTotalM += std::hypot(dfR, dfI);
    4395           0 :                         ++nCount;
    4396             :                     }
    4397             :                 }
    4398             : 
    4399           0 :                 CPLAssert(nCount > 0);
    4400           0 :                 if (nCount == 0)
    4401             :                 {
    4402           0 :                     pafDstScanline[iDstPixelSZ * 2] = 0.0;
    4403           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
    4404             :                 }
    4405             :                 else
    4406             :                 {
    4407           0 :                     pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
    4408           0 :                         dfTotalR / static_cast<double>(nCount));
    4409           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
    4410           0 :                         dfTotalI / static_cast<double>(nCount));
    4411             :                     const double dfM =
    4412           0 :                         double(std::hypot(pafDstScanline[iDstPixelSZ * 2],
    4413           0 :                                           pafDstScanline[iDstPixelSZ * 2 + 1]));
    4414           0 :                     const double dfDesiredM =
    4415           0 :                         dfTotalM / static_cast<double>(nCount);
    4416           0 :                     double dfRatio = 1.0;
    4417           0 :                     if (dfM != 0.0)
    4418           0 :                         dfRatio = dfDesiredM / dfM;
    4419             : 
    4420           0 :                     pafDstScanline[iDstPixelSZ * 2] *=
    4421           0 :                         static_cast<float>(dfRatio);
    4422           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] *=
    4423           0 :                         static_cast<float>(dfRatio);
    4424             :                 }
    4425             :             }
    4426          12 :             else if (eMethod == RMS)
    4427             :             {
    4428          12 :                 double dfTotalR = 0.0;
    4429          12 :                 double dfTotalI = 0.0;
    4430          12 :                 size_t nCount = 0;
    4431             : 
    4432          36 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4433             :                 {
    4434          72 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4435             :                     {
    4436          48 :                         const double dfR = double(
    4437          48 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4438          48 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4439          48 :                                                nSrcWidth * 2]);
    4440          48 :                         const double dfI = double(
    4441          48 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4442          48 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4443          48 :                                                nSrcWidth * 2 +
    4444          48 :                                            1]);
    4445             : 
    4446          48 :                         dfTotalR += SQUARE(dfR);
    4447          48 :                         dfTotalI += SQUARE(dfI);
    4448             : 
    4449          48 :                         ++nCount;
    4450             :                     }
    4451             :                 }
    4452             : 
    4453          12 :                 CPLAssert(nCount > 0);
    4454          12 :                 if (nCount == 0)
    4455             :                 {
    4456           0 :                     pafDstScanline[iDstPixelSZ * 2] = 0.0;
    4457           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
    4458             :                 }
    4459             :                 else
    4460             :                 {
    4461             :                     /* compute RMS */
    4462          12 :                     pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
    4463          12 :                         sqrt(dfTotalR / static_cast<double>(nCount)));
    4464          12 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
    4465          12 :                         sqrt(dfTotalI / static_cast<double>(nCount)));
    4466             :                 }
    4467             :             }
    4468           0 :             else if (eMethod == AVERAGE)
    4469             :             {
    4470           0 :                 double dfTotalR = 0.0;
    4471           0 :                 double dfTotalI = 0.0;
    4472           0 :                 size_t nCount = 0;
    4473             : 
    4474           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4475             :                 {
    4476           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4477             :                     {
    4478             :                         // TODO(schwehr): Maybe use std::complex?
    4479           0 :                         dfTotalR += double(
    4480           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4481           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4482           0 :                                                nSrcWidth * 2]);
    4483           0 :                         dfTotalI += double(
    4484           0 :                             pafSrcScanline[static_cast<size_t>(iX) * 2 +
    4485           0 :                                            static_cast<size_t>(iY - nSrcYOff) *
    4486           0 :                                                nSrcWidth * 2 +
    4487           0 :                                            1]);
    4488           0 :                         ++nCount;
    4489             :                     }
    4490             :                 }
    4491             : 
    4492           0 :                 CPLAssert(nCount > 0);
    4493           0 :                 if (nCount == 0)
    4494             :                 {
    4495           0 :                     pafDstScanline[iDstPixelSZ * 2] = 0.0;
    4496           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = 0.0;
    4497             :                 }
    4498             :                 else
    4499             :                 {
    4500           0 :                     pafDstScanline[iDstPixelSZ * 2] = static_cast<float>(
    4501           0 :                         dfTotalR / static_cast<double>(nCount));
    4502           0 :                     pafDstScanline[iDstPixelSZ * 2 + 1] = static_cast<float>(
    4503           0 :                         dfTotalI / static_cast<double>(nCount));
    4504             :                 }
    4505             :             }
    4506             :         }
    4507             :     }
    4508             : 
    4509           2 :     return CE_None;
    4510             : }
    4511             : 
    4512             : /************************************************************************/
    4513             : /*                  GDALRegenerateCascadingOverviews()                  */
    4514             : /*                                                                      */
    4515             : /*      Generate a list of overviews in order from largest to           */
    4516             : /*      smallest, computing each from the next larger.                  */
    4517             : /************************************************************************/
    4518             : 
    4519          44 : static CPLErr GDALRegenerateCascadingOverviews(
    4520             :     GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
    4521             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    4522             :     void *pProgressData, CSLConstList papszOptions)
    4523             : 
    4524             : {
    4525             :     /* -------------------------------------------------------------------- */
    4526             :     /*      First, we must put the overviews in order from largest to       */
    4527             :     /*      smallest.                                                       */
    4528             :     /* -------------------------------------------------------------------- */
    4529         127 :     for (int i = 0; i < nOverviews - 1; ++i)
    4530             :     {
    4531         292 :         for (int j = 0; j < nOverviews - i - 1; ++j)
    4532             :         {
    4533         209 :             if (papoOvrBands[j]->GetXSize() *
    4534         209 :                     static_cast<float>(papoOvrBands[j]->GetYSize()) <
    4535         209 :                 papoOvrBands[j + 1]->GetXSize() *
    4536         209 :                     static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
    4537             :             {
    4538           0 :                 GDALRasterBand *poTempBand = papoOvrBands[j];
    4539           0 :                 papoOvrBands[j] = papoOvrBands[j + 1];
    4540           0 :                 papoOvrBands[j + 1] = poTempBand;
    4541             :             }
    4542             :         }
    4543             :     }
    4544             : 
    4545             :     /* -------------------------------------------------------------------- */
    4546             :     /*      Count total pixels so we can prepare appropriate scaled         */
    4547             :     /*      progress functions.                                             */
    4548             :     /* -------------------------------------------------------------------- */
    4549          44 :     double dfTotalPixels = 0.0;
    4550             : 
    4551         171 :     for (int i = 0; i < nOverviews; ++i)
    4552             :     {
    4553         127 :         dfTotalPixels += papoOvrBands[i]->GetXSize() *
    4554         127 :                          static_cast<double>(papoOvrBands[i]->GetYSize());
    4555             :     }
    4556             : 
    4557             :     /* -------------------------------------------------------------------- */
    4558             :     /*      Generate all the bands.                                         */
    4559             :     /* -------------------------------------------------------------------- */
    4560          44 :     double dfPixelsProcessed = 0.0;
    4561             : 
    4562         171 :     for (int i = 0; i < nOverviews; ++i)
    4563             :     {
    4564         127 :         GDALRasterBand *poBaseBand = poSrcBand;
    4565         127 :         if (i != 0)
    4566          83 :             poBaseBand = papoOvrBands[i - 1];
    4567             : 
    4568         127 :         double dfPixels = papoOvrBands[i]->GetXSize() *
    4569         127 :                           static_cast<double>(papoOvrBands[i]->GetYSize());
    4570             : 
    4571         254 :         void *pScaledProgressData = GDALCreateScaledProgress(
    4572             :             dfPixelsProcessed / dfTotalPixels,
    4573         127 :             (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
    4574             :             pProgressData);
    4575             : 
    4576         254 :         const CPLErr eErr = GDALRegenerateOverviewsEx(
    4577             :             poBaseBand, 1,
    4578         127 :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
    4579             :             pszResampling, GDALScaledProgress, pScaledProgressData,
    4580             :             papszOptions);
    4581         127 :         GDALDestroyScaledProgress(pScaledProgressData);
    4582             : 
    4583         127 :         if (eErr != CE_None)
    4584           0 :             return eErr;
    4585             : 
    4586         127 :         dfPixelsProcessed += dfPixels;
    4587             : 
    4588             :         // Only do the bit2grayscale promotion on the base band.
    4589         127 :         if (STARTS_WITH_CI(pszResampling,
    4590             :                            "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
    4591           8 :             pszResampling = "AVERAGE";
    4592             :     }
    4593             : 
    4594          44 :     return CE_None;
    4595             : }
    4596             : 
    4597             : /************************************************************************/
    4598             : /*                    GDALGetResampleFunction()                         */
    4599             : /************************************************************************/
    4600             : 
    4601        5409 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
    4602             :                                              int *pnRadius)
    4603             : {
    4604        5409 :     if (pnRadius)
    4605        5407 :         *pnRadius = 0;
    4606        5409 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))
    4607         512 :         return GDALResampleChunk_Near;
    4608        4897 :     else if (STARTS_WITH_CI(pszResampling, "AVER") ||
    4609        4324 :              EQUAL(pszResampling, "RMS"))
    4610         634 :         return GDALResampleChunk_AverageOrRMS;
    4611        4263 :     else if (EQUAL(pszResampling, "GAUSS"))
    4612             :     {
    4613          26 :         if (pnRadius)
    4614          26 :             *pnRadius = 1;
    4615          26 :         return GDALResampleChunk_Gauss;
    4616             :     }
    4617        4237 :     else if (EQUAL(pszResampling, "MODE"))
    4618         136 :         return GDALResampleChunk_Mode;
    4619        4101 :     else if (EQUAL(pszResampling, "CUBIC"))
    4620             :     {
    4621        1593 :         if (pnRadius)
    4622        1593 :             *pnRadius = GWKGetFilterRadius(GRA_Cubic);
    4623        1585 :         return GDALResampleChunk_Convolution;
    4624             :     }
    4625        2508 :     else if (EQUAL(pszResampling, "CUBICSPLINE"))
    4626             :     {
    4627          39 :         if (pnRadius)
    4628          39 :             *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
    4629          39 :         return GDALResampleChunk_Convolution;
    4630             :     }
    4631        2469 :     else if (EQUAL(pszResampling, "LANCZOS"))
    4632             :     {
    4633          44 :         if (pnRadius)
    4634          44 :             *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
    4635          44 :         return GDALResampleChunk_Convolution;
    4636             :     }
    4637        2425 :     else if (EQUAL(pszResampling, "BILINEAR"))
    4638             :     {
    4639        2430 :         if (pnRadius)
    4640        2430 :             *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
    4641        2430 :         return GDALResampleChunk_Convolution;
    4642             :     }
    4643             :     else
    4644             :     {
    4645           0 :         CPLError(
    4646             :             CE_Failure, CPLE_AppDefined,
    4647             :             "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
    4648             :             pszResampling);
    4649           0 :         return nullptr;
    4650             :     }
    4651             : }
    4652             : 
    4653             : /************************************************************************/
    4654             : /*                      GDALGetOvrWorkDataType()                        */
    4655             : /************************************************************************/
    4656             : 
    4657        5284 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
    4658             :                                     GDALDataType eSrcDataType)
    4659             : {
    4660        5284 :     if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
    4661             :     {
    4662         633 :         return eSrcDataType;
    4663             :     }
    4664        4651 :     else if (eSrcDataType == GDT_Byte &&
    4665        4121 :              (STARTS_WITH_CI(pszResampling, "AVER") ||
    4666        3642 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4667        2279 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4668        2274 :               EQUAL(pszResampling, "LANCZOS") ||
    4669        2267 :               EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
    4670             :     {
    4671        4113 :         return GDT_Byte;
    4672             :     }
    4673         538 :     else if (eSrcDataType == GDT_UInt16 &&
    4674         128 :              (STARTS_WITH_CI(pszResampling, "AVER") ||
    4675         123 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4676           8 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4677           6 :               EQUAL(pszResampling, "LANCZOS") ||
    4678           3 :               EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))
    4679             :     {
    4680         126 :         return GDT_UInt16;
    4681             :     }
    4682         412 :     else if (EQUAL(pszResampling, "GAUSS"))
    4683          20 :         return GDT_Float64;
    4684             : 
    4685         392 :     if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
    4686         388 :         eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
    4687             :         eSrcDataType == GDT_Float32)
    4688             :     {
    4689         258 :         return GDT_Float32;
    4690             :     }
    4691         134 :     return GDT_Float64;
    4692             : }
    4693             : 
    4694             : namespace
    4695             : {
    4696             : // Structure to hold a pointer to free with CPLFree()
    4697             : struct PointerHolder
    4698             : {
    4699             :     void *ptr = nullptr;
    4700             : 
    4701        5840 :     explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
    4702             :     {
    4703        5840 :     }
    4704             : 
    4705        5840 :     ~PointerHolder()
    4706        5840 :     {
    4707        5840 :         CPLFree(ptr);
    4708        5840 :     }
    4709             : 
    4710             :     PointerHolder(const PointerHolder &) = delete;
    4711             :     PointerHolder &operator=(const PointerHolder &) = delete;
    4712             : };
    4713             : }  // namespace
    4714             : 
    4715             : /************************************************************************/
    4716             : /*                      GDALRegenerateOverviews()                       */
    4717             : /************************************************************************/
    4718             : 
    4719             : /**
    4720             :  * \brief Generate downsampled overviews.
    4721             :  *
    4722             :  * This function will generate one or more overview images from a base image
    4723             :  * using the requested downsampling algorithm.  Its primary use is for
    4724             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4725             :  * used to generate downsampled images in one file from another outside the
    4726             :  * overview architecture.
    4727             :  *
    4728             :  * The output bands need to exist in advance.
    4729             :  *
    4730             :  * The full set of resampling algorithms is documented in
    4731             :  * GDALDataset::BuildOverviews().
    4732             :  *
    4733             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    4734             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    4735             :  * considered as the nodata value and not each value of the triplet
    4736             :  * independently per band.
    4737             :  *
    4738             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4739             :  * to "ALL_CPUS" or a integer value to specify the number of threads to use for
    4740             :  * overview computation.
    4741             :  *
    4742             :  * @param hSrcBand the source (base level) band.
    4743             :  * @param nOverviewCount the number of downsampled bands being generated.
    4744             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4745             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4746             :  * @param pfnProgress progress report function.
    4747             :  * @param pProgressData progress function callback data.
    4748             :  * @return CE_None on success or CE_Failure on failure.
    4749             :  */
    4750         250 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
    4751             :                                GDALRasterBandH *pahOvrBands,
    4752             :                                const char *pszResampling,
    4753             :                                GDALProgressFunc pfnProgress,
    4754             :                                void *pProgressData)
    4755             : 
    4756             : {
    4757         250 :     return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
    4758             :                                      pszResampling, pfnProgress, pProgressData,
    4759         250 :                                      nullptr);
    4760             : }
    4761             : 
    4762             : /************************************************************************/
    4763             : /*                     GDALRegenerateOverviewsEx()                      */
    4764             : /************************************************************************/
    4765             : 
    4766             : constexpr int RADIUS_TO_DIAMETER = 2;
    4767             : 
    4768             : /**
    4769             :  * \brief Generate downsampled overviews.
    4770             :  *
    4771             :  * This function will generate one or more overview images from a base image
    4772             :  * using the requested downsampling algorithm.  Its primary use is for
    4773             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4774             :  * used to generate downsampled images in one file from another outside the
    4775             :  * overview architecture.
    4776             :  *
    4777             :  * The output bands need to exist in advance.
    4778             :  *
    4779             :  * The full set of resampling algorithms is documented in
    4780             :  * GDALDataset::BuildOverviews().
    4781             :  *
    4782             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    4783             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    4784             :  * considered as the nodata value and not each value of the triplet
    4785             :  * independently per band.
    4786             :  *
    4787             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4788             :  * to "ALL_CPUS" or a integer value to specify the number of threads to use for
    4789             :  * overview computation.
    4790             :  *
    4791             :  * @param hSrcBand the source (base level) band.
    4792             :  * @param nOverviewCount the number of downsampled bands being generated.
    4793             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4794             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4795             :  * @param pfnProgress progress report function.
    4796             :  * @param pProgressData progress function callback data.
    4797             :  * @param papszOptions NULL terminated list of options as key=value pairs, or
    4798             :  * NULL
    4799             :  * @return CE_None on success or CE_Failure on failure.
    4800             :  * @since GDAL 3.6
    4801             :  */
    4802         903 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
    4803             :                                  GDALRasterBandH *pahOvrBands,
    4804             :                                  const char *pszResampling,
    4805             :                                  GDALProgressFunc pfnProgress,
    4806             :                                  void *pProgressData, CSLConstList papszOptions)
    4807             : 
    4808             : {
    4809         903 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    4810         903 :     GDALRasterBand **papoOvrBands =
    4811             :         reinterpret_cast<GDALRasterBand **>(pahOvrBands);
    4812             : 
    4813         903 :     if (pfnProgress == nullptr)
    4814         252 :         pfnProgress = GDALDummyProgress;
    4815             : 
    4816         903 :     if (EQUAL(pszResampling, "NONE"))
    4817          49 :         return CE_None;
    4818             : 
    4819         854 :     int nKernelRadius = 0;
    4820             :     GDALResampleFunction pfnResampleFn =
    4821         854 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    4822             : 
    4823         854 :     if (pfnResampleFn == nullptr)
    4824           0 :         return CE_Failure;
    4825             : 
    4826             :     /* -------------------------------------------------------------------- */
    4827             :     /*      Check color tables...                                           */
    4828             :     /* -------------------------------------------------------------------- */
    4829         854 :     GDALColorTable *poColorTable = nullptr;
    4830             : 
    4831         487 :     if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
    4832        1786 :          EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
    4833         456 :         poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    4834             :     {
    4835           9 :         poColorTable = poSrcBand->GetColorTable();
    4836           9 :         if (poColorTable != nullptr)
    4837             :         {
    4838           9 :             if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
    4839             :             {
    4840           0 :                 CPLError(CE_Warning, CPLE_AppDefined,
    4841             :                          "Computing overviews on palette index raster bands "
    4842             :                          "with a palette whose color interpretation is not RGB "
    4843             :                          "will probably lead to unexpected results.");
    4844           0 :                 poColorTable = nullptr;
    4845             :             }
    4846           9 :             else if (poColorTable->IsIdentity())
    4847             :             {
    4848           0 :                 poColorTable = nullptr;
    4849             :             }
    4850             :         }
    4851             :         else
    4852             :         {
    4853           0 :             CPLError(CE_Warning, CPLE_AppDefined,
    4854             :                      "Computing overviews on palette index raster bands "
    4855             :                      "without a palette will probably lead to unexpected "
    4856             :                      "results.");
    4857             :         }
    4858             :     }
    4859             :     // Not ready yet
    4860        2481 :     else if ((EQUAL(pszResampling, "CUBIC") ||
    4861         791 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4862         791 :               EQUAL(pszResampling, "LANCZOS") ||
    4863        1716 :               EQUAL(pszResampling, "BILINEAR")) &&
    4864          80 :              poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    4865             :     {
    4866           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    4867             :                  "Computing %s overviews on palette index raster bands "
    4868             :                  "will probably lead to unexpected results.",
    4869             :                  pszResampling);
    4870             :     }
    4871             : 
    4872             :     // If we have a nodata mask and we are doing something more complicated
    4873             :     // than nearest neighbouring, we have to fetch to nodata mask.
    4874             : 
    4875         854 :     GDALRasterBand *poMaskBand = nullptr;
    4876         854 :     bool bUseNoDataMask = false;
    4877         854 :     bool bCanUseCascaded = true;
    4878             : 
    4879         854 :     if (!STARTS_WITH_CI(pszResampling, "NEAR"))
    4880             :     {
    4881             :         // Special case if we are an alpha/mask band. We want it to be
    4882             :         // considered as the mask band to avoid alpha=0 to be taken into account
    4883             :         // in average computation.
    4884         536 :         if (poSrcBand->IsMaskBand())
    4885             :         {
    4886          91 :             poMaskBand = poSrcBand;
    4887          91 :             bUseNoDataMask = true;
    4888             :         }
    4889             :         else
    4890             :         {
    4891         445 :             poMaskBand = poSrcBand->GetMaskBand();
    4892         445 :             const int nMaskFlags = poSrcBand->GetMaskFlags();
    4893         445 :             bCanUseCascaded =
    4894         445 :                 (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
    4895         445 :             bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
    4896             :         }
    4897             :     }
    4898             : 
    4899             :     /* -------------------------------------------------------------------- */
    4900             :     /*      If we are operating on multiple overviews, and using            */
    4901             :     /*      averaging, lets do them in cascading order to reduce the        */
    4902             :     /*      amount of computation.                                          */
    4903             :     /* -------------------------------------------------------------------- */
    4904             : 
    4905             :     // In case the mask made be computed from another band of the dataset,
    4906             :     // we can't use cascaded generation, as the computation of the overviews
    4907             :     // of the band used for the mask band may not have yet occurred (#3033).
    4908         854 :     if ((STARTS_WITH_CI(pszResampling, "AVER") ||
    4909         487 :          EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
    4910         456 :          EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
    4911         402 :          EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
    4912         854 :          EQUAL(pszResampling, "MODE")) &&
    4913          44 :         nOverviewCount > 1 && bCanUseCascaded)
    4914          44 :         return GDALRegenerateCascadingOverviews(
    4915             :             poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
    4916          44 :             pProgressData, papszOptions);
    4917             : 
    4918             :     /* -------------------------------------------------------------------- */
    4919             :     /*      Setup one horizontal swath to read from the raw buffer.         */
    4920             :     /* -------------------------------------------------------------------- */
    4921         810 :     int nFRXBlockSize = 0;
    4922         810 :     int nFRYBlockSize = 0;
    4923         810 :     poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
    4924             : 
    4925         810 :     const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
    4926        1302 :     const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
    4927        1252 :                                        EQUAL(pszResampling, "MODE") ||
    4928         442 :                                        !GDALDataTypeIsComplex(eSrcDataType);
    4929             :     const GDALDataType eWrkDataType =
    4930             :         bUseGenericResampleFn
    4931         810 :             ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
    4932         810 :             : GDT_CFloat32;
    4933             : 
    4934         810 :     const int nWidth = poSrcBand->GetXSize();
    4935         810 :     const int nHeight = poSrcBand->GetYSize();
    4936             : 
    4937         810 :     int nMaxOvrFactor = 1;
    4938        1737 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    4939             :     {
    4940         927 :         const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
    4941         927 :         const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
    4942         927 :         nMaxOvrFactor = std::max(
    4943             :             nMaxOvrFactor,
    4944         927 :             static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
    4945         927 :         nMaxOvrFactor = std::max(
    4946             :             nMaxOvrFactor,
    4947         927 :             static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
    4948             :     }
    4949             : 
    4950         810 :     int nFullResYChunk = nFRYBlockSize;
    4951         810 :     int nMaxChunkYSizeQueried = 0;
    4952             : 
    4953             :     const auto UpdateChunkHeightAndGetChunkSize =
    4954       10629 :         [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
    4955       85993 :          eWrkDataType, nWidth]()
    4956             :     {
    4957             :         // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
    4958             :         // + nFullResYChunk) / nMaxOvrFactor)
    4959       10629 :         if (nMaxOvrFactor > INT_MAX / RADIUS_TO_DIAMETER)
    4960             :         {
    4961           1 :             return GINTBIG_MAX;
    4962             :         }
    4963       10628 :         nFullResYChunk =
    4964       10628 :             std::max(nFullResYChunk, RADIUS_TO_DIAMETER * nMaxOvrFactor);
    4965       10628 :         if ((nKernelRadius > 0 &&
    4966         970 :              nMaxOvrFactor > INT_MAX / (RADIUS_TO_DIAMETER * nKernelRadius)) ||
    4967       10628 :             nFullResYChunk >
    4968       10628 :                 INT_MAX - RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor)
    4969             :         {
    4970           0 :             return GINTBIG_MAX;
    4971             :         }
    4972       10628 :         nMaxChunkYSizeQueried =
    4973       10628 :             nFullResYChunk + RADIUS_TO_DIAMETER * nKernelRadius * nMaxOvrFactor;
    4974       10628 :         if (GDALGetDataTypeSizeBytes(eWrkDataType) >
    4975       10628 :             std::numeric_limits<int64_t>::max() /
    4976       10628 :                 (static_cast<int64_t>(nMaxChunkYSizeQueried) * nWidth))
    4977             :         {
    4978           1 :             return GINTBIG_MAX;
    4979             :         }
    4980       10627 :         return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
    4981       10627 :                nMaxChunkYSizeQueried * nWidth;
    4982         810 :     };
    4983             : 
    4984             :     const char *pszChunkYSize =
    4985         810 :         CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
    4986             : #ifndef __COVERITY__
    4987             :     // Only configurable for debug / testing
    4988         810 :     if (pszChunkYSize)
    4989             :     {
    4990           0 :         nFullResYChunk = atoi(pszChunkYSize);
    4991             :     }
    4992             : #endif
    4993             : 
    4994             :     // Only configurable for debug / testing
    4995             :     const int nChunkMaxSize =
    4996         810 :         atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
    4997             : 
    4998         810 :     auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
    4999         810 :     if (nChunkSize > nChunkMaxSize)
    5000             :     {
    5001          15 :         if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
    5002          44 :             !GDALDataTypeIsComplex(eSrcDataType) &&
    5003          14 :             (!STARTS_WITH_CI(pszResampling, "AVER") ||
    5004           2 :              EQUAL(pszResampling, "AVERAGE")))
    5005             :         {
    5006             :             // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
    5007             :             // which use a block based strategy, which is much less memory
    5008             :             // hungry.
    5009          14 :             return GDALRegenerateOverviewsMultiBand(
    5010             :                 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
    5011          14 :                 pfnProgress, pProgressData, papszOptions);
    5012             :         }
    5013           1 :         else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
    5014             :         {
    5015           0 :             return GDALRegenerateCascadingOverviews(
    5016             :                 poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
    5017           0 :                 pfnProgress, pProgressData, papszOptions);
    5018             :         }
    5019             :     }
    5020         795 :     else if (pszChunkYSize == nullptr)
    5021             :     {
    5022             :         // Try to get as close as possible to nChunkMaxSize
    5023       10614 :         while (nChunkSize < nChunkMaxSize / 2)
    5024             :         {
    5025        9819 :             nFullResYChunk *= 2;
    5026        9819 :             nChunkSize = UpdateChunkHeightAndGetChunkSize();
    5027             :         }
    5028             :     }
    5029             : 
    5030         796 :     int nHasNoData = 0;
    5031         796 :     const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
    5032         796 :     const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    5033             :     const bool bPropagateNoData =
    5034         796 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    5035             : 
    5036             :     // Structure describing a resampling job
    5037             :     struct OvrJob
    5038             :     {
    5039             :         // Buffers to free when job is finished
    5040             :         std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
    5041             :         std::shared_ptr<PointerHolder> oSrcBufferHolder{};
    5042             :         std::unique_ptr<PointerHolder> oDstBufferHolder{};
    5043             : 
    5044             :         GDALRasterBand *poDstBand = nullptr;
    5045             : 
    5046             :         // Input parameters of pfnResampleFn
    5047             :         GDALResampleFunction pfnResampleFn = nullptr;
    5048             :         int nSrcWidth = 0;
    5049             :         int nSrcHeight = 0;
    5050             :         int nDstWidth = 0;
    5051             :         GDALOverviewResampleArgs args{};
    5052             :         const void *pChunk = nullptr;
    5053             :         bool bUseGenericResampleFn = false;
    5054             : 
    5055             :         // Output values of resampling function
    5056             :         CPLErr eErr = CE_Failure;
    5057             :         void *pDstBuffer = nullptr;
    5058             :         GDALDataType eDstBufferDataType = GDT_Unknown;
    5059             : 
    5060           0 :         void SetSrcMaskBufferHolder(
    5061             :             const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
    5062             :         {
    5063           0 :             oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
    5064           0 :         }
    5065             : 
    5066           0 :         void SetSrcBufferHolder(
    5067             :             const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
    5068             :         {
    5069           0 :             oSrcBufferHolder = oSrcBufferHolderIn;
    5070           0 :         }
    5071             : 
    5072         896 :         void NotifyFinished()
    5073             :         {
    5074        1792 :             std::lock_guard guard(mutex);
    5075         896 :             bFinished = true;
    5076         896 :             cv.notify_one();
    5077         896 :         }
    5078             : 
    5079           0 :         bool IsFinished()
    5080             :         {
    5081           0 :             std::lock_guard guard(mutex);
    5082           0 :             return bFinished;
    5083             :         }
    5084             : 
    5085           0 :         void WaitFinished()
    5086             :         {
    5087           0 :             std::unique_lock oGuard(mutex);
    5088           0 :             while (!bFinished)
    5089             :             {
    5090           0 :                 cv.wait(oGuard);
    5091             :             }
    5092           0 :         }
    5093             : 
    5094             :       private:
    5095             :         // Synchronization
    5096             :         bool bFinished = false;
    5097             :         std::mutex mutex{};
    5098             :         std::condition_variable cv{};
    5099             :     };
    5100             : 
    5101             :     // Thread function to resample
    5102         896 :     const auto JobResampleFunc = [](void *pData)
    5103             :     {
    5104         896 :         OvrJob *poJob = static_cast<OvrJob *>(pData);
    5105             : 
    5106         896 :         if (poJob->bUseGenericResampleFn)
    5107             :         {
    5108         894 :             poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
    5109             :                                                &(poJob->pDstBuffer),
    5110             :                                                &(poJob->eDstBufferDataType));
    5111             :         }
    5112             :         else
    5113             :         {
    5114           2 :             poJob->eErr = GDALResampleChunkC32R(
    5115             :                 poJob->nSrcWidth, poJob->nSrcHeight,
    5116           2 :                 static_cast<const float *>(poJob->pChunk),
    5117             :                 poJob->args.nChunkYOff, poJob->args.nChunkYSize,
    5118             :                 poJob->args.nDstYOff, poJob->args.nDstYOff2,
    5119             :                 poJob->args.nOvrXSize, poJob->args.nOvrYSize,
    5120             :                 &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
    5121             :                 poJob->args.pszResampling);
    5122             :         }
    5123             : 
    5124             :         poJob->oDstBufferHolder =
    5125         896 :             std::make_unique<PointerHolder>(poJob->pDstBuffer);
    5126             : 
    5127         896 :         poJob->NotifyFinished();
    5128         896 :     };
    5129             : 
    5130             :     // Function to write resample data to target band
    5131         896 :     const auto WriteJobData = [](const OvrJob *poJob)
    5132             :     {
    5133        1792 :         return poJob->poDstBand->RasterIO(
    5134         896 :             GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
    5135         896 :             poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
    5136         896 :             poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
    5137         896 :             poJob->eDstBufferDataType, 0, 0, nullptr);
    5138             :     };
    5139             : 
    5140             :     // Wait for completion of oldest job and serialize it
    5141             :     const auto WaitAndFinalizeOldestJob =
    5142           0 :         [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    5143             :     {
    5144           0 :         auto poOldestJob = jobList.front().get();
    5145           0 :         poOldestJob->WaitFinished();
    5146           0 :         CPLErr l_eErr = poOldestJob->eErr;
    5147           0 :         if (l_eErr == CE_None)
    5148             :         {
    5149           0 :             l_eErr = WriteJobData(poOldestJob);
    5150             :         }
    5151             : 
    5152           0 :         jobList.pop_front();
    5153           0 :         return l_eErr;
    5154             :     };
    5155             : 
    5156             :     // Queue of jobs
    5157        1592 :     std::list<std::unique_ptr<OvrJob>> jobList;
    5158             : 
    5159         796 :     GByte *pabyChunkNodataMask = nullptr;
    5160         796 :     void *pChunk = nullptr;
    5161             : 
    5162         796 :     const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
    5163        3184 :     const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
    5164         796 :                                                        ? CPLGetNumCPUs()
    5165         796 :                                                        : atoi(pszThreads)));
    5166             :     auto poThreadPool =
    5167         796 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    5168             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    5169        1592 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    5170             : 
    5171             :     /* -------------------------------------------------------------------- */
    5172             :     /*      Loop over image operating on chunks.                            */
    5173             :     /* -------------------------------------------------------------------- */
    5174         796 :     int nChunkYOff = 0;
    5175         796 :     CPLErr eErr = CE_None;
    5176             : 
    5177        1597 :     for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
    5178         801 :          nChunkYOff += nFullResYChunk)
    5179             :     {
    5180         801 :         if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
    5181             :                          pProgressData))
    5182             :         {
    5183           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    5184           0 :             eErr = CE_Failure;
    5185             :         }
    5186             : 
    5187         801 :         if (nFullResYChunk + nChunkYOff > nHeight)
    5188         793 :             nFullResYChunk = nHeight - nChunkYOff;
    5189             : 
    5190         801 :         int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
    5191         801 :         int nChunkYSizeQueried =
    5192         801 :             nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
    5193         801 :         if (nChunkYOffQueried < 0)
    5194             :         {
    5195          83 :             nChunkYSizeQueried += nChunkYOffQueried;
    5196          83 :             nChunkYOffQueried = 0;
    5197             :         }
    5198         801 :         if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
    5199          83 :             nChunkYSizeQueried = nHeight - nChunkYOffQueried;
    5200             : 
    5201             :         // Avoid accumulating too many tasks and exhaust RAM
    5202             :         // Try to complete already finished jobs
    5203         801 :         while (eErr == CE_None && !jobList.empty())
    5204             :         {
    5205           0 :             auto poOldestJob = jobList.front().get();
    5206           0 :             if (!poOldestJob->IsFinished())
    5207           0 :                 break;
    5208           0 :             eErr = poOldestJob->eErr;
    5209           0 :             if (eErr == CE_None)
    5210             :             {
    5211           0 :                 eErr = WriteJobData(poOldestJob);
    5212             :             }
    5213             : 
    5214           0 :             jobList.pop_front();
    5215             :         }
    5216             : 
    5217             :         // And in case we have saturated the number of threads,
    5218             :         // wait for completion of tasks to go below the threshold.
    5219        1602 :         while (eErr == CE_None &&
    5220         801 :                jobList.size() >= static_cast<size_t>(nThreads))
    5221             :         {
    5222           0 :             eErr = WaitAndFinalizeOldestJob(jobList);
    5223             :         }
    5224             : 
    5225             :         // (Re)allocate buffers if needed
    5226         801 :         if (pChunk == nullptr)
    5227             :         {
    5228         796 :             pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    5229             :                                          nMaxChunkYSizeQueried, nWidth);
    5230             :         }
    5231         801 :         if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
    5232             :         {
    5233             :             pabyChunkNodataMask = static_cast<GByte *>(
    5234         283 :                 VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
    5235             :         }
    5236             : 
    5237         801 :         if (pChunk == nullptr ||
    5238         283 :             (bUseNoDataMask && pabyChunkNodataMask == nullptr))
    5239             :         {
    5240           0 :             CPLFree(pChunk);
    5241           0 :             CPLFree(pabyChunkNodataMask);
    5242           0 :             return CE_Failure;
    5243             :         }
    5244             : 
    5245             :         // Read chunk.
    5246         801 :         if (eErr == CE_None)
    5247         801 :             eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    5248             :                                        nChunkYSizeQueried, pChunk, nWidth,
    5249             :                                        nChunkYSizeQueried, eWrkDataType, 0, 0,
    5250             :                                        nullptr);
    5251         801 :         if (eErr == CE_None && bUseNoDataMask)
    5252         283 :             eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    5253             :                                         nChunkYSizeQueried, pabyChunkNodataMask,
    5254             :                                         nWidth, nChunkYSizeQueried, GDT_Byte, 0,
    5255             :                                         0, nullptr);
    5256             : 
    5257             :         // Special case to promote 1bit data to 8bit 0/255 values.
    5258         801 :         if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
    5259             :         {
    5260           9 :             if (eWrkDataType == GDT_Float32)
    5261             :             {
    5262           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    5263           0 :                 for (size_t i = 0;
    5264           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5265             :                 {
    5266           0 :                     if (pafChunk[i] == 1.0f)
    5267           0 :                         pafChunk[i] = 255.0f;
    5268             :                 }
    5269             :             }
    5270           9 :             else if (eWrkDataType == GDT_Byte)
    5271             :             {
    5272           9 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    5273      168417 :                 for (size_t i = 0;
    5274      168417 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5275             :                 {
    5276      168408 :                     if (pabyChunk[i] == 1)
    5277      127437 :                         pabyChunk[i] = 255;
    5278             :                 }
    5279             :             }
    5280           0 :             else if (eWrkDataType == GDT_UInt16)
    5281             :             {
    5282           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    5283           0 :                 for (size_t i = 0;
    5284           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5285             :                 {
    5286           0 :                     if (pasChunk[i] == 1)
    5287           0 :                         pasChunk[i] = 255;
    5288             :                 }
    5289             :             }
    5290           0 :             else if (eWrkDataType == GDT_Float64)
    5291             :             {
    5292           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    5293           0 :                 for (size_t i = 0;
    5294           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5295             :                 {
    5296           0 :                     if (padfChunk[i] == 1.0)
    5297           0 :                         padfChunk[i] = 255.0;
    5298             :                 }
    5299             :             }
    5300             :             else
    5301             :             {
    5302           0 :                 CPLAssert(false);
    5303             :             }
    5304             :         }
    5305         792 :         else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
    5306             :         {
    5307           0 :             if (eWrkDataType == GDT_Float32)
    5308             :             {
    5309           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    5310           0 :                 for (size_t i = 0;
    5311           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5312             :                 {
    5313           0 :                     if (pafChunk[i] == 1.0f)
    5314           0 :                         pafChunk[i] = 0.0f;
    5315           0 :                     else if (pafChunk[i] == 0.0f)
    5316           0 :                         pafChunk[i] = 255.0f;
    5317             :                 }
    5318             :             }
    5319           0 :             else if (eWrkDataType == GDT_Byte)
    5320             :             {
    5321           0 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    5322           0 :                 for (size_t i = 0;
    5323           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5324             :                 {
    5325           0 :                     if (pabyChunk[i] == 1)
    5326           0 :                         pabyChunk[i] = 0;
    5327           0 :                     else if (pabyChunk[i] == 0)
    5328           0 :                         pabyChunk[i] = 255;
    5329             :                 }
    5330             :             }
    5331           0 :             else if (eWrkDataType == GDT_UInt16)
    5332             :             {
    5333           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    5334           0 :                 for (size_t i = 0;
    5335           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5336             :                 {
    5337           0 :                     if (pasChunk[i] == 1)
    5338           0 :                         pasChunk[i] = 0;
    5339           0 :                     else if (pasChunk[i] == 0)
    5340           0 :                         pasChunk[i] = 255;
    5341             :                 }
    5342             :             }
    5343           0 :             else if (eWrkDataType == GDT_Float64)
    5344             :             {
    5345           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    5346           0 :                 for (size_t i = 0;
    5347           0 :                      i < static_cast<size_t>(nChunkYSizeQueried) * nWidth; i++)
    5348             :                 {
    5349           0 :                     if (padfChunk[i] == 1.0)
    5350           0 :                         padfChunk[i] = 0.0;
    5351           0 :                     else if (padfChunk[i] == 0.0)
    5352           0 :                         padfChunk[i] = 255.0;
    5353             :                 }
    5354             :             }
    5355             :             else
    5356             :             {
    5357           0 :                 CPLAssert(false);
    5358             :             }
    5359             :         }
    5360             : 
    5361             :         auto oSrcBufferHolder =
    5362        1602 :             std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
    5363             :         auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
    5364        1602 :             poJobQueue ? pabyChunkNodataMask : nullptr);
    5365             : 
    5366        1697 :         for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
    5367             :              ++iOverview)
    5368             :         {
    5369         896 :             GDALRasterBand *poDstBand = papoOvrBands[iOverview];
    5370         896 :             const int nDstWidth = poDstBand->GetXSize();
    5371         896 :             const int nDstHeight = poDstBand->GetYSize();
    5372             : 
    5373         896 :             const double dfXRatioDstToSrc =
    5374         896 :                 static_cast<double>(nWidth) / nDstWidth;
    5375         896 :             const double dfYRatioDstToSrc =
    5376         896 :                 static_cast<double>(nHeight) / nDstHeight;
    5377             : 
    5378             :             /* --------------------------------------------------------------------
    5379             :              */
    5380             :             /*      Figure out the line to start writing to, and the first line
    5381             :              */
    5382             :             /*      to not write to.  In theory this approach should ensure that
    5383             :              */
    5384             :             /*      every output line will be written if all input chunks are */
    5385             :             /*      processed. */
    5386             :             /* --------------------------------------------------------------------
    5387             :              */
    5388         896 :             int nDstYOff =
    5389         896 :                 static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
    5390         896 :             if (nDstYOff == nDstHeight)
    5391           0 :                 continue;
    5392         896 :             int nDstYOff2 = static_cast<int>(
    5393         896 :                 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
    5394             : 
    5395         896 :             if (nChunkYOff + nFullResYChunk == nHeight)
    5396         889 :                 nDstYOff2 = nDstHeight;
    5397             : #if DEBUG_VERBOSE
    5398             :             CPLDebug("GDAL",
    5399             :                      "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
    5400             :                      nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
    5401             :                      nDstWidth, nDstYOff2 - nDstYOff);
    5402             : #endif
    5403             : 
    5404        1792 :             auto poJob = std::make_unique<OvrJob>();
    5405         896 :             poJob->pfnResampleFn = pfnResampleFn;
    5406         896 :             poJob->bUseGenericResampleFn = bUseGenericResampleFn;
    5407         896 :             poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
    5408         896 :             poJob->args.nOvrXSize = poDstBand->GetXSize();
    5409         896 :             poJob->args.nOvrYSize = poDstBand->GetYSize();
    5410             :             const char *pszNBITS =
    5411         896 :                 poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    5412         896 :             poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    5413         896 :             poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    5414         896 :             poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    5415         896 :             poJob->args.eWrkDataType = eWrkDataType;
    5416         896 :             poJob->pChunk = pChunk;
    5417         896 :             poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
    5418         896 :             poJob->nSrcWidth = nWidth;
    5419         896 :             poJob->nSrcHeight = nHeight;
    5420         896 :             poJob->args.nChunkXOff = 0;
    5421         896 :             poJob->args.nChunkXSize = nWidth;
    5422         896 :             poJob->args.nChunkYOff = nChunkYOffQueried;
    5423         896 :             poJob->args.nChunkYSize = nChunkYSizeQueried;
    5424         896 :             poJob->nDstWidth = nDstWidth;
    5425         896 :             poJob->args.nDstXOff = 0;
    5426         896 :             poJob->args.nDstXOff2 = nDstWidth;
    5427         896 :             poJob->args.nDstYOff = nDstYOff;
    5428         896 :             poJob->args.nDstYOff2 = nDstYOff2;
    5429         896 :             poJob->poDstBand = poDstBand;
    5430         896 :             poJob->args.pszResampling = pszResampling;
    5431         896 :             poJob->args.bHasNoData = bHasNoData;
    5432         896 :             poJob->args.dfNoDataValue = dfNoDataValue;
    5433         896 :             poJob->args.poColorTable = poColorTable;
    5434         896 :             poJob->args.eSrcDataType = eSrcDataType;
    5435         896 :             poJob->args.bPropagateNoData = bPropagateNoData;
    5436             : 
    5437         896 :             if (poJobQueue)
    5438             :             {
    5439           0 :                 poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
    5440           0 :                 poJob->SetSrcBufferHolder(oSrcBufferHolder);
    5441           0 :                 poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    5442           0 :                 jobList.emplace_back(std::move(poJob));
    5443             :             }
    5444             :             else
    5445             :             {
    5446         896 :                 JobResampleFunc(poJob.get());
    5447         896 :                 eErr = poJob->eErr;
    5448         896 :                 if (eErr == CE_None)
    5449             :                 {
    5450         896 :                     eErr = WriteJobData(poJob.get());
    5451             :                 }
    5452             :             }
    5453             :         }
    5454             : 
    5455         801 :         if (poJobQueue)
    5456             :         {
    5457           0 :             pChunk = nullptr;
    5458           0 :             pabyChunkNodataMask = nullptr;
    5459             :         }
    5460             :     }
    5461             : 
    5462         796 :     VSIFree(pChunk);
    5463         796 :     VSIFree(pabyChunkNodataMask);
    5464             : 
    5465             :     // Wait for all pending jobs to complete
    5466         796 :     while (!jobList.empty())
    5467             :     {
    5468           0 :         const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    5469           0 :         if (l_eErr != CE_None && eErr == CE_None)
    5470           0 :             eErr = l_eErr;
    5471             :     }
    5472             : 
    5473             :     /* -------------------------------------------------------------------- */
    5474             :     /*      Renormalized overview mean / stddev if needed.                  */
    5475             :     /* -------------------------------------------------------------------- */
    5476         796 :     if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
    5477             :     {
    5478           0 :         GDALOverviewMagnitudeCorrection(
    5479             :             poSrcBand, nOverviewCount,
    5480             :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
    5481             :             GDALDummyProgress, nullptr);
    5482             :     }
    5483             : 
    5484             :     /* -------------------------------------------------------------------- */
    5485             :     /*      It can be important to flush out data to overviews.             */
    5486             :     /* -------------------------------------------------------------------- */
    5487        1685 :     for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
    5488             :          ++iOverview)
    5489             :     {
    5490         889 :         eErr = papoOvrBands[iOverview]->FlushCache(false);
    5491             :     }
    5492             : 
    5493         796 :     if (eErr == CE_None)
    5494         796 :         pfnProgress(1.0, nullptr, pProgressData);
    5495             : 
    5496         796 :     return eErr;
    5497             : }
    5498             : 
    5499             : /************************************************************************/
    5500             : /*            GDALRegenerateOverviewsMultiBand()                        */
    5501             : /************************************************************************/
    5502             : 
    5503             : /**
    5504             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    5505             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
    5506             :  *
    5507             :  * This function will generate one or more overview images from a base
    5508             :  * image using the requested downsampling algorithm.  Its primary use
    5509             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    5510             :  * can also be used to generate downsampled images in one file from another
    5511             :  * outside the overview architecture.
    5512             :  *
    5513             :  * The output bands need to exist in advance and share the same characteristics
    5514             :  * (type, dimensions)
    5515             :  *
    5516             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    5517             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
    5518             :  *
    5519             :  * It does not support color tables or complex data types.
    5520             :  *
    5521             :  * The pseudo-algorithm used by the function is :
    5522             :  *    for each overview
    5523             :  *       iterate on lines of the source by a step of deltay
    5524             :  *           iterate on columns of the source  by a step of deltax
    5525             :  *               read the source data of size deltax * deltay for all the bands
    5526             :  *               generate the corresponding overview block for all the bands
    5527             :  *
    5528             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    5529             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    5530             :  * considered as the nodata value and not each value of the triplet
    5531             :  * independently per band.
    5532             :  *
    5533             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    5534             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    5535             :  * overview computation.
    5536             :  *
    5537             :  * @param nBands the number of bands, size of papoSrcBands and size of
    5538             :  *               first dimension of papapoOverviewBands
    5539             :  * @param papoSrcBands the list of source bands to downsample
    5540             :  * @param nOverviews the number of downsampled overview levels being generated.
    5541             :  * @param papapoOverviewBands bidimensional array of bands. First dimension is
    5542             :  *                            indexed by nBands. Second dimension is indexed by
    5543             :  *                            nOverviews.
    5544             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    5545             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
    5546             :  * @param pfnProgress progress report function.
    5547             :  * @param pProgressData progress function callback data.
    5548             :  * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
    5549             :  *                     key=value pairs, or NULL
    5550             :  *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
    5551             :  *                     options can be specified to express that overviews should
    5552             :  *                     be regenerated only in the specified subset of the source
    5553             :  *                     dataset.
    5554             :  * @return CE_None on success or CE_Failure on failure.
    5555             :  */
    5556             : 
    5557         388 : CPLErr GDALRegenerateOverviewsMultiBand(
    5558             :     int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
    5559             :     GDALRasterBand *const *const *papapoOverviewBands,
    5560             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    5561             :     void *pProgressData, CSLConstList papszOptions)
    5562             : {
    5563         388 :     CPL_IGNORE_RET_VAL(papszOptions);
    5564             : 
    5565         388 :     if (pfnProgress == nullptr)
    5566          11 :         pfnProgress = GDALDummyProgress;
    5567             : 
    5568         388 :     if (EQUAL(pszResampling, "NONE") || nBands == 0 || nOverviews == 0)
    5569           3 :         return CE_None;
    5570             : 
    5571             :     // Sanity checks.
    5572         385 :     if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
    5573         191 :         !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
    5574          82 :         !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
    5575          22 :         !EQUAL(pszResampling, "CUBICSPLINE") &&
    5576          21 :         !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
    5577           5 :         !EQUAL(pszResampling, "MODE"))
    5578             :     {
    5579           0 :         CPLError(CE_Failure, CPLE_NotSupported,
    5580             :                  "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
    5581             :                  "not supported",
    5582             :                  pszResampling);
    5583           0 :         return CE_Failure;
    5584             :     }
    5585             : 
    5586         385 :     int nKernelRadius = 0;
    5587             :     GDALResampleFunction pfnResampleFn =
    5588         385 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    5589         385 :     if (pfnResampleFn == nullptr)
    5590           0 :         return CE_Failure;
    5591             : 
    5592         385 :     const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
    5593         385 :     const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
    5594         385 :     if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
    5595           0 :         return CE_None;
    5596         385 :     GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
    5597       66232 :     for (int iBand = 1; iBand < nBands; ++iBand)
    5598             :     {
    5599      131694 :         if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
    5600       65847 :             papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
    5601             :         {
    5602           0 :             CPLError(
    5603             :                 CE_Failure, CPLE_NotSupported,
    5604             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5605             :                 "have the same dimensions");
    5606           0 :             return CE_Failure;
    5607             :         }
    5608       65847 :         if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
    5609             :         {
    5610           0 :             CPLError(
    5611             :                 CE_Failure, CPLE_NotSupported,
    5612             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5613             :                 "have the same data type");
    5614           0 :             return CE_Failure;
    5615             :         }
    5616             :     }
    5617             : 
    5618        1031 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5619             :     {
    5620         646 :         const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
    5621         646 :         const int nDstWidth = poOvrFirstBand->GetXSize();
    5622         646 :         const int nDstHeight = poOvrFirstBand->GetYSize();
    5623       66759 :         for (int iBand = 1; iBand < nBands; ++iBand)
    5624             :         {
    5625       66113 :             const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
    5626      132226 :             if (poOvrBand->GetXSize() != nDstWidth ||
    5627       66113 :                 poOvrBand->GetYSize() != nDstHeight)
    5628             :             {
    5629           0 :                 CPLError(
    5630             :                     CE_Failure, CPLE_NotSupported,
    5631             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5632             :                     "of the same level must have the same dimensions");
    5633           0 :                 return CE_Failure;
    5634             :             }
    5635       66113 :             if (poOvrBand->GetRasterDataType() != eDataType)
    5636             :             {
    5637           0 :                 CPLError(
    5638             :                     CE_Failure, CPLE_NotSupported,
    5639             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5640             :                     "must have the same data type as the source bands");
    5641           0 :                 return CE_Failure;
    5642             :             }
    5643             :         }
    5644             :     }
    5645             : 
    5646             :     // First pass to compute the total number of pixels to write.
    5647         385 :     double dfTotalPixelCount = 0;
    5648         385 :     const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
    5649         385 :     const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
    5650         385 :     const int nSrcXSize = atoi(CSLFetchNameValueDef(
    5651             :         papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
    5652         385 :     const int nSrcYSize = atoi(CSLFetchNameValueDef(
    5653             :         papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
    5654        1031 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5655             :     {
    5656         646 :         dfTotalPixelCount +=
    5657        1292 :             static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
    5658         646 :             papapoOverviewBands[0][iOverview]->GetXSize() *
    5659        1292 :             static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
    5660         646 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5661             :     }
    5662             : 
    5663             :     const GDALDataType eWrkDataType =
    5664         385 :         GDALGetOvrWorkDataType(pszResampling, eDataType);
    5665             :     const int nWrkDataTypeSize =
    5666         385 :         std::max(1, GDALGetDataTypeSizeBytes(eWrkDataType));
    5667             : 
    5668         385 :     const bool bIsMask = papoSrcBands[0]->IsMaskBand();
    5669             : 
    5670             :     // If we have a nodata mask and we are doing something more complicated
    5671             :     // than nearest neighbouring, we have to fetch the nodata mask.
    5672             :     const bool bUseNoDataMask =
    5673         568 :         !STARTS_WITH_CI(pszResampling, "NEAR") &&
    5674         183 :         (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
    5675             : 
    5676         770 :     std::vector<bool> abHasNoData(nBands);
    5677         770 :     std::vector<double> adfNoDataValue(nBands);
    5678             : 
    5679       66617 :     for (int iBand = 0; iBand < nBands; ++iBand)
    5680             :     {
    5681       66232 :         int nHasNoData = 0;
    5682      132464 :         adfNoDataValue[iBand] =
    5683       66232 :             papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
    5684       66232 :         abHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
    5685             :     }
    5686             :     const bool bPropagateNoData =
    5687         385 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    5688             : 
    5689         385 :     const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
    5690        1540 :     const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
    5691         385 :                                                        ? CPLGetNumCPUs()
    5692         385 :                                                        : atoi(pszThreads)));
    5693             :     auto poThreadPool =
    5694         385 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    5695             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    5696         770 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    5697             : 
    5698             :     // Only configurable for debug / testing
    5699         385 :     const GIntBig nChunkMaxSize = []() -> GIntBig
    5700             :     {
    5701             :         const char *pszVal =
    5702         385 :             CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", nullptr);
    5703         385 :         if (pszVal)
    5704             :         {
    5705          15 :             GIntBig nRet = 0;
    5706          15 :             CPLParseMemorySize(pszVal, &nRet, nullptr);
    5707          15 :             return std::max<GIntBig>(100, nRet);
    5708             :         }
    5709         370 :         return 10 * 1024 * 1024;
    5710         385 :     }();
    5711             : 
    5712             :     // Only configurable for debug / testing
    5713         385 :     const GIntBig nChunkMaxSizeForTempFile = []() -> GIntBig
    5714             :     {
    5715         385 :         const char *pszVal = CPLGetConfigOption(
    5716             :             "GDAL_OVR_CHUNK_MAX_SIZE_FOR_TEMP_FILE", nullptr);
    5717         385 :         if (pszVal)
    5718             :         {
    5719          14 :             GIntBig nRet = 0;
    5720          14 :             CPLParseMemorySize(pszVal, &nRet, nullptr);
    5721          14 :             return std::max<GIntBig>(100, nRet);
    5722             :         }
    5723         371 :         const auto nUsableRAM = CPLGetUsablePhysicalRAM();
    5724         371 :         if (nUsableRAM > 0)
    5725         371 :             return nUsableRAM / 10;
    5726             :         // Select a value to be able to at least downsample by 2 for a RGB
    5727             :         // 1024x1024 tiled output: (2 * 1024 + 2) * (2 * 1024 + 2) * 3 = 12 MB
    5728           0 :         return 100 * 1024 * 1024;
    5729         385 :     }();
    5730             : 
    5731             :     // Second pass to do the real job.
    5732         385 :     double dfCurPixelCount = 0;
    5733         385 :     CPLErr eErr = CE_None;
    5734        1025 :     for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
    5735             :          ++iOverview)
    5736             :     {
    5737         645 :         int iSrcOverview = -1;  // -1 means the source bands.
    5738             : 
    5739             :         const int nDstTotalWidth =
    5740         645 :             papapoOverviewBands[0][iOverview]->GetXSize();
    5741             :         const int nDstTotalHeight =
    5742         645 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5743             : 
    5744             :         // Compute the coordinates of the target region to refresh
    5745         645 :         constexpr double EPS = 1e-8;
    5746         645 :         const int nDstXOffStart = static_cast<int>(
    5747         645 :             static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
    5748             :             EPS);
    5749             :         const int nDstXOffEnd =
    5750        1290 :             std::min(static_cast<int>(
    5751         645 :                          std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
    5752         645 :                                        nToplevelSrcWidth * nDstTotalWidth -
    5753             :                                    EPS)),
    5754         645 :                      nDstTotalWidth);
    5755         645 :         const int nDstWidth = nDstXOffEnd - nDstXOffStart;
    5756         645 :         const int nDstYOffStart =
    5757         645 :             static_cast<int>(static_cast<double>(nSrcYOff) /
    5758         645 :                                  nToplevelSrcHeight * nDstTotalHeight +
    5759             :                              EPS);
    5760             :         const int nDstYOffEnd =
    5761        1290 :             std::min(static_cast<int>(
    5762         645 :                          std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
    5763         645 :                                        nToplevelSrcHeight * nDstTotalHeight -
    5764             :                                    EPS)),
    5765         645 :                      nDstTotalHeight);
    5766         645 :         const int nDstHeight = nDstYOffEnd - nDstYOffStart;
    5767             : 
    5768             :         // Try to use previous level of overview as the source to compute
    5769             :         // the next level.
    5770         645 :         int nSrcWidth = nToplevelSrcWidth;
    5771         645 :         int nSrcHeight = nToplevelSrcHeight;
    5772         905 :         if (iOverview > 0 &&
    5773         260 :             papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
    5774             :         {
    5775         252 :             nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
    5776         252 :             nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
    5777         252 :             iSrcOverview = iOverview - 1;
    5778             :         }
    5779             : 
    5780         645 :         const double dfXRatioDstToSrc =
    5781         645 :             static_cast<double>(nSrcWidth) / nDstTotalWidth;
    5782         645 :         const double dfYRatioDstToSrc =
    5783         645 :             static_cast<double>(nSrcHeight) / nDstTotalHeight;
    5784             : 
    5785             :         const int nOvrFactor =
    5786        1935 :             std::max(1, std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    5787         645 :                                  static_cast<int>(0.5 + dfYRatioDstToSrc)));
    5788             : 
    5789         645 :         int nDstChunkXSize = 0;
    5790         645 :         int nDstChunkYSize = 0;
    5791         645 :         papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
    5792             :                                                         &nDstChunkYSize);
    5793             : 
    5794         645 :         constexpr int PIXEL_MARGIN = 2;
    5795             :         // Try to extend the chunk size so that the memory needed to acquire
    5796             :         // source pixels goes up to 10 MB.
    5797             :         // This can help for drivers that support multi-threaded reading
    5798         645 :         const int nFullResYChunk = static_cast<int>(std::min<double>(
    5799         645 :             nSrcHeight, PIXEL_MARGIN + nDstChunkYSize * dfYRatioDstToSrc));
    5800         645 :         const int nFullResYChunkQueried = static_cast<int>(std::min<int64_t>(
    5801        1290 :             nSrcHeight,
    5802        1290 :             nFullResYChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    5803         645 :                                  nKernelRadius * nOvrFactor));
    5804         881 :         while (nDstChunkXSize < nDstWidth)
    5805             :         {
    5806         255 :             constexpr int INCREASE_FACTOR = 2;
    5807             : 
    5808         255 :             const int nFullResXChunk = static_cast<int>(std::min<double>(
    5809         510 :                 nSrcWidth, PIXEL_MARGIN + INCREASE_FACTOR * nDstChunkXSize *
    5810         255 :                                               dfXRatioDstToSrc));
    5811             : 
    5812             :             const int nFullResXChunkQueried =
    5813         255 :                 static_cast<int>(std::min<int64_t>(
    5814         510 :                     nSrcWidth,
    5815         510 :                     nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    5816         255 :                                          nKernelRadius * nOvrFactor));
    5817             : 
    5818         255 :             if (nBands > nChunkMaxSize / nFullResXChunkQueried /
    5819         255 :                              nFullResYChunkQueried / nWrkDataTypeSize)
    5820             :             {
    5821          19 :                 break;
    5822             :             }
    5823             : 
    5824         236 :             nDstChunkXSize *= INCREASE_FACTOR;
    5825             :         }
    5826         645 :         nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
    5827             : 
    5828         645 :         const int nFullResXChunk = static_cast<int>(std::min<double>(
    5829         645 :             nSrcWidth, PIXEL_MARGIN + nDstChunkXSize * dfXRatioDstToSrc));
    5830         645 :         const int nFullResXChunkQueried = static_cast<int>(std::min<int64_t>(
    5831        1290 :             nSrcWidth,
    5832        1290 :             nFullResXChunk + static_cast<int64_t>(RADIUS_TO_DIAMETER) *
    5833         645 :                                  nKernelRadius * nOvrFactor));
    5834             : 
    5835             :         // Make sure that the RAM requirements to acquire the source data does
    5836             :         // not exceed nChunkMaxSizeForTempFile
    5837             :         // If so, reduce the destination chunk size, generate overviews in a
    5838             :         // temporary dataset, and copy that temporary dataset over the target
    5839             :         // overview bands (to avoid issues with lossy compression)
    5840             :         const bool bOverflowFullResXChunkYChunkQueried =
    5841         645 :             nBands > std::numeric_limits<int64_t>::max() /
    5842         645 :                          nFullResXChunkQueried / nFullResYChunkQueried /
    5843         645 :                          nWrkDataTypeSize;
    5844             : 
    5845         645 :         const auto nMemRequirement =
    5846             :             bOverflowFullResXChunkYChunkQueried
    5847         645 :                 ? 0
    5848         641 :                 : static_cast<GIntBig>(nFullResXChunkQueried) *
    5849         641 :                       nFullResYChunkQueried * nBands * nWrkDataTypeSize;
    5850             :         // Use a temporary dataset with a smaller destination chunk size
    5851         645 :         const auto nOverShootFactor =
    5852             :             nMemRequirement / nChunkMaxSizeForTempFile;
    5853             : 
    5854         645 :         constexpr int MIN_OVERSHOOT_FACTOR = 4;
    5855             :         const auto nSqrtOverShootFactor = std::max<GIntBig>(
    5856        1290 :             MIN_OVERSHOOT_FACTOR, static_cast<GIntBig>(std::ceil(std::sqrt(
    5857         645 :                                       static_cast<double>(nOverShootFactor)))));
    5858         645 :         constexpr int DEFAULT_CHUNK_SIZE = 256;
    5859         645 :         constexpr int GTIFF_BLOCK_SIZE_MULTIPLE = 16;
    5860             :         const int nReducedDstChunkXSize =
    5861             :             bOverflowFullResXChunkYChunkQueried
    5862        1286 :                 ? DEFAULT_CHUNK_SIZE
    5863        1286 :                 : std::max(1, static_cast<int>(nDstChunkXSize /
    5864        1286 :                                                nSqrtOverShootFactor) &
    5865         641 :                                   ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
    5866             :         const int nReducedDstChunkYSize =
    5867             :             bOverflowFullResXChunkYChunkQueried
    5868        1286 :                 ? DEFAULT_CHUNK_SIZE
    5869        1286 :                 : std::max(1, static_cast<int>(nDstChunkYSize /
    5870        1286 :                                                nSqrtOverShootFactor) &
    5871         641 :                                   ~(GTIFF_BLOCK_SIZE_MULTIPLE - 1));
    5872             : 
    5873         645 :         if (bOverflowFullResXChunkYChunkQueried ||
    5874             :             nMemRequirement > nChunkMaxSizeForTempFile)
    5875             :         {
    5876             :             const auto nDTSize =
    5877          43 :                 std::max(1, GDALGetDataTypeSizeBytes(eDataType));
    5878             :             const bool bTmpDSMemRequirementOverflow =
    5879          43 :                 nBands > std::numeric_limits<int64_t>::max() / nDstWidth /
    5880          43 :                              nDstHeight / nDTSize;
    5881          43 :             const auto nTmpDSMemRequirement =
    5882             :                 bTmpDSMemRequirementOverflow
    5883          43 :                     ? 0
    5884          41 :                     : static_cast<GIntBig>(nDstWidth) * nDstHeight * nBands *
    5885          41 :                           nDTSize;
    5886             : 
    5887             :             // make sure that one band buffer doesn't overflow size_t
    5888             :             const bool bChunkSizeOverflow =
    5889          43 :                 static_cast<size_t>(nDTSize) >
    5890          43 :                 std::numeric_limits<size_t>::max() / nDstWidth / nDstHeight;
    5891          43 :             const size_t nChunkSize =
    5892             :                 bChunkSizeOverflow
    5893          43 :                     ? 0
    5894          41 :                     : static_cast<size_t>(nDstWidth) * nDstHeight * nDTSize;
    5895             : 
    5896             :             const auto CreateVRT =
    5897          41 :                 [nBands, nSrcWidth, nSrcHeight, nDstTotalWidth, nDstTotalHeight,
    5898             :                  pszResampling, eWrkDataType, papoSrcBands, papapoOverviewBands,
    5899             :                  iSrcOverview, &abHasNoData,
    5900      393585 :                  &adfNoDataValue](int nVRTBlockXSize, int nVRTBlockYSize)
    5901             :             {
    5902             :                 auto poVRTDS = std::make_unique<VRTDataset>(
    5903          41 :                     nDstTotalWidth, nDstTotalHeight, nVRTBlockXSize,
    5904          41 :                     nVRTBlockYSize);
    5905             : 
    5906       65620 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    5907             :                 {
    5908      131158 :                     auto poVRTSrc = std::make_unique<VRTSimpleSource>();
    5909       65579 :                     poVRTSrc->SetResampling(pszResampling);
    5910       65579 :                     poVRTDS->AddBand(eWrkDataType);
    5911             :                     auto poVRTBand = static_cast<VRTSourcedRasterBand *>(
    5912       65579 :                         poVRTDS->GetRasterBand(iBand + 1));
    5913             : 
    5914       65579 :                     auto poSrcBand = papoSrcBands[iBand];
    5915       65579 :                     if (iSrcOverview != -1)
    5916          24 :                         poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
    5917       65579 :                     poVRTBand->ConfigureSource(
    5918             :                         poVRTSrc.get(), poSrcBand, false, 0, 0, nSrcWidth,
    5919             :                         nSrcHeight, 0, 0, nDstTotalWidth, nDstTotalHeight);
    5920             :                     // Add the source to the band
    5921       65579 :                     poVRTBand->AddSource(poVRTSrc.release());
    5922       65579 :                     if (abHasNoData[iBand])
    5923           3 :                         poVRTBand->SetNoDataValue(adfNoDataValue[iBand]);
    5924             :                 }
    5925             : 
    5926          42 :                 if (papoSrcBands[0]->GetMaskFlags() == GMF_PER_DATASET &&
    5927           1 :                     poVRTDS->CreateMaskBand(GMF_PER_DATASET) == CE_None)
    5928             :                 {
    5929             :                     VRTSourcedRasterBand *poMaskVRTBand =
    5930           1 :                         cpl::down_cast<VRTSourcedRasterBand *>(
    5931           1 :                             poVRTDS->GetRasterBand(1)->GetMaskBand());
    5932           1 :                     auto poSrcBand = papoSrcBands[0];
    5933           1 :                     if (iSrcOverview != -1)
    5934           0 :                         poSrcBand = papapoOverviewBands[0][iSrcOverview];
    5935           1 :                     poMaskVRTBand->AddMaskBandSource(
    5936           1 :                         poSrcBand->GetMaskBand(), 0, 0, nSrcWidth, nSrcHeight,
    5937             :                         0, 0, nDstTotalWidth, nDstTotalHeight);
    5938             :                 }
    5939             : 
    5940          41 :                 return poVRTDS;
    5941          43 :             };
    5942             : 
    5943             :             // If the overview accommodates chunking, do so and recurse
    5944             :             // to avoid generating full size temporary files
    5945          43 :             if (!bOverflowFullResXChunkYChunkQueried &&
    5946          39 :                 !bTmpDSMemRequirementOverflow && !bChunkSizeOverflow &&
    5947          39 :                 (nDstChunkXSize < nDstWidth || nDstChunkYSize < nDstHeight))
    5948             :             {
    5949             :                 // Create a VRT with the smaller chunk to do the scaling
    5950             :                 auto poVRTDS =
    5951          13 :                     CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
    5952             : 
    5953          13 :                 std::vector<GDALRasterBand *> apoVRTBand(nBands);
    5954          13 :                 std::vector<GDALRasterBand *> apoDstBand(nBands);
    5955       65560 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    5956             :                 {
    5957       65547 :                     apoDstBand[iBand] = papapoOverviewBands[iBand][iOverview];
    5958       65547 :                     apoVRTBand[iBand] = poVRTDS->GetRasterBand(iBand + 1);
    5959             :                 }
    5960             : 
    5961             :                 // Use a flag to avoid reading from the overview being built
    5962             :                 GDALRasterIOExtraArg sExtraArg;
    5963          13 :                 INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    5964          13 :                 if (iSrcOverview == -1)
    5965          13 :                     sExtraArg.bUseOnlyThisScale = true;
    5966             : 
    5967             :                 // A single band buffer for data transfer to the overview
    5968          13 :                 std::vector<GByte> abyChunk;
    5969             :                 try
    5970             :                 {
    5971          13 :                     abyChunk.resize(nChunkSize);
    5972             :                 }
    5973           0 :                 catch (const std::exception &)
    5974             :                 {
    5975           0 :                     CPLError(CE_Failure, CPLE_OutOfMemory,
    5976             :                              "Out of memory allocating temporary buffer");
    5977           0 :                     return CE_Failure;
    5978             :                 }
    5979             : 
    5980             :                 // Loop over output height, in chunks
    5981          13 :                 for (int nDstYOff = nDstYOffStart;
    5982          38 :                      nDstYOff < nDstYOffEnd && eErr == CE_None;
    5983             :                      /* */)
    5984             :                 {
    5985             :                     const int nDstYCount =
    5986          25 :                         std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
    5987             :                     // Loop over output width, in output chunks
    5988          25 :                     for (int nDstXOff = nDstXOffStart;
    5989          74 :                          nDstXOff < nDstXOffEnd && eErr == CE_None;
    5990             :                          /* */)
    5991             :                     {
    5992             :                         const int nDstXCount =
    5993          49 :                             std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
    5994             :                         // Read and transfer the chunk to the overview
    5995          98 :                         for (int iBand = 0; iBand < nBands && eErr == CE_None;
    5996             :                              ++iBand)
    5997             :                         {
    5998          98 :                             eErr = apoVRTBand[iBand]->RasterIO(
    5999             :                                 GF_Read, nDstXOff, nDstYOff, nDstXCount,
    6000          49 :                                 nDstYCount, abyChunk.data(), nDstXCount,
    6001             :                                 nDstYCount, eDataType, 0, 0, &sExtraArg);
    6002          49 :                             if (eErr == CE_None)
    6003             :                             {
    6004          96 :                                 eErr = apoDstBand[iBand]->RasterIO(
    6005             :                                     GF_Write, nDstXOff, nDstYOff, nDstXCount,
    6006          48 :                                     nDstYCount, abyChunk.data(), nDstXCount,
    6007             :                                     nDstYCount, eDataType, 0, 0, nullptr);
    6008             :                             }
    6009             :                         }
    6010             : 
    6011          49 :                         dfCurPixelCount +=
    6012          49 :                             static_cast<double>(nDstXCount) * nDstYCount;
    6013             : 
    6014          49 :                         nDstXOff += nDstXCount;
    6015             :                     }  // width
    6016             : 
    6017          25 :                     if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount,
    6018             :                                      nullptr, pProgressData))
    6019             :                     {
    6020           0 :                         CPLError(CE_Failure, CPLE_UserInterrupt,
    6021             :                                  "User terminated");
    6022           0 :                         eErr = CE_Failure;
    6023             :                     }
    6024             : 
    6025          25 :                     nDstYOff += nDstYCount;
    6026             :                 }  // height
    6027             : 
    6028          13 :                 if (CE_None != eErr)
    6029             :                 {
    6030           1 :                     CPLError(CE_Failure, CPLE_AppDefined,
    6031             :                              "Error while writing overview");
    6032           1 :                     return CE_Failure;
    6033             :                 }
    6034             : 
    6035          12 :                 pfnProgress(1.0, nullptr, pProgressData);
    6036             :                 // Flush the overviews we just generated
    6037          24 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    6038          12 :                     apoDstBand[iBand]->FlushCache(false);
    6039             : 
    6040          12 :                 continue;  // Next overview
    6041             :             }              // chunking via temporary dataset
    6042             : 
    6043           0 :             std::unique_ptr<GDALDataset> poTmpDS;
    6044             :             // Config option mostly/only for autotest purposes
    6045             :             const char *pszGDAL_OVR_TEMP_DRIVER =
    6046          30 :                 CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
    6047          30 :             if ((!bTmpDSMemRequirementOverflow &&
    6048           4 :                  nTmpDSMemRequirement <= nChunkMaxSizeForTempFile &&
    6049           4 :                  !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
    6050          26 :                 EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
    6051             :             {
    6052          10 :                 auto poTmpDrv = GetGDALDriverManager()->GetDriverByName("MEM");
    6053          10 :                 if (!poTmpDrv)
    6054             :                 {
    6055           0 :                     eErr = CE_Failure;
    6056           0 :                     break;
    6057             :                 }
    6058          10 :                 poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
    6059             :                                                nDstTotalHeight, nBands,
    6060          10 :                                                eDataType, nullptr));
    6061             :             }
    6062             :             else
    6063             :             {
    6064             :                 // Create a temporary file for the overview
    6065             :                 auto poTmpDrv =
    6066          20 :                     GetGDALDriverManager()->GetDriverByName("GTiff");
    6067          20 :                 if (!poTmpDrv)
    6068             :                 {
    6069           0 :                     eErr = CE_Failure;
    6070           0 :                     break;
    6071             :                 }
    6072          40 :                 std::string osTmpFilename;
    6073          20 :                 auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
    6074          20 :                 if (poDstDS)
    6075             :                 {
    6076          20 :                     osTmpFilename = poDstDS->GetDescription();
    6077             :                     VSIStatBufL sStatBuf;
    6078          20 :                     if (!osTmpFilename.empty() &&
    6079           0 :                         VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
    6080           0 :                         osTmpFilename += "_tmp_ovr.tif";
    6081             :                 }
    6082          20 :                 if (osTmpFilename.empty())
    6083             :                 {
    6084          20 :                     osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
    6085          20 :                     osTmpFilename += ".tif";
    6086             :                 }
    6087          20 :                 CPLDebug("GDAL", "Creating temporary file %s of %d x %d x %d",
    6088             :                          osTmpFilename.c_str(), nDstWidth, nDstHeight, nBands);
    6089          40 :                 CPLStringList aosCO;
    6090          20 :                 if (0 == ((nReducedDstChunkXSize % GTIFF_BLOCK_SIZE_MULTIPLE) |
    6091          20 :                           (nReducedDstChunkYSize % GTIFF_BLOCK_SIZE_MULTIPLE)))
    6092             :                 {
    6093          14 :                     aosCO.SetNameValue("TILED", "YES");
    6094             :                     aosCO.SetNameValue("BLOCKXSIZE",
    6095          14 :                                        CPLSPrintf("%d", nReducedDstChunkXSize));
    6096             :                     aosCO.SetNameValue("BLOCKYSIZE",
    6097          14 :                                        CPLSPrintf("%d", nReducedDstChunkYSize));
    6098             :                 }
    6099          20 :                 if (const char *pszCOList =
    6100          20 :                         poTmpDrv->GetMetadataItem(GDAL_DMD_CREATIONOPTIONLIST))
    6101             :                 {
    6102             :                     aosCO.SetNameValue(
    6103          20 :                         "COMPRESS", strstr(pszCOList, "ZSTD") ? "ZSTD" : "LZW");
    6104             :                 }
    6105          20 :                 poTmpDS.reset(poTmpDrv->Create(osTmpFilename.c_str(), nDstWidth,
    6106             :                                                nDstHeight, nBands, eDataType,
    6107          20 :                                                aosCO.List()));
    6108          20 :                 if (poTmpDS)
    6109             :                 {
    6110          18 :                     poTmpDS->MarkSuppressOnClose();
    6111          18 :                     VSIUnlink(osTmpFilename.c_str());
    6112             :                 }
    6113             :             }
    6114          30 :             if (!poTmpDS)
    6115             :             {
    6116           2 :                 eErr = CE_Failure;
    6117           2 :                 break;
    6118             :             }
    6119             : 
    6120             :             // Create a full size VRT to do the resampling without edge effects
    6121             :             auto poVRTDS =
    6122          28 :                 CreateVRT(nReducedDstChunkXSize, nReducedDstChunkYSize);
    6123             : 
    6124             :             // Allocate a band buffer with the overview chunk size
    6125             :             std::unique_ptr<void, VSIFreeReleaser> pDstBuffer(
    6126             :                 VSI_MALLOC3_VERBOSE(size_t(nWrkDataTypeSize), nDstChunkXSize,
    6127          28 :                                     nDstChunkYSize));
    6128          28 :             if (pDstBuffer == nullptr)
    6129             :             {
    6130           0 :                 eErr = CE_Failure;
    6131           0 :                 break;
    6132             :             }
    6133             : 
    6134             :             // Use a flag to avoid reading the overview being built
    6135             :             GDALRasterIOExtraArg sExtraArg;
    6136          28 :             INIT_RASTERIO_EXTRA_ARG(sExtraArg);
    6137          28 :             if (iSrcOverview == -1)
    6138           4 :                 sExtraArg.bUseOnlyThisScale = true;
    6139             : 
    6140             :             // Scale and copy data from the VRT to the temp file
    6141          28 :             for (int nDstYOff = nDstYOffStart;
    6142         914 :                  nDstYOff < nDstYOffEnd && eErr == CE_None;
    6143             :                  /* */)
    6144             :             {
    6145             :                 const int nDstYCount =
    6146         886 :                     std::min(nReducedDstChunkYSize, nDstYOffEnd - nDstYOff);
    6147         886 :                 for (int nDstXOff = nDstXOffStart;
    6148      201218 :                      nDstXOff < nDstXOffEnd && eErr == CE_None;
    6149             :                      /* */)
    6150             :                 {
    6151             :                     const int nDstXCount =
    6152      200332 :                         std::min(nReducedDstChunkXSize, nDstXOffEnd - nDstXOff);
    6153      400668 :                     for (int iBand = 0; iBand < nBands && eErr == CE_None;
    6154             :                          ++iBand)
    6155             :                     {
    6156      200336 :                         auto poSrcBand = poVRTDS->GetRasterBand(iBand + 1);
    6157      200336 :                         eErr = poSrcBand->RasterIO(
    6158             :                             GF_Read, nDstXOff, nDstYOff, nDstXCount, nDstYCount,
    6159             :                             pDstBuffer.get(), nDstXCount, nDstYCount,
    6160             :                             eWrkDataType, 0, 0, &sExtraArg);
    6161      200336 :                         if (eErr == CE_None)
    6162             :                         {
    6163             :                             // Write to the temporary dataset, shifted
    6164      200334 :                             auto poOvrBand = poTmpDS->GetRasterBand(iBand + 1);
    6165      200334 :                             eErr = poOvrBand->RasterIO(
    6166             :                                 GF_Write, nDstXOff - nDstXOffStart,
    6167             :                                 nDstYOff - nDstYOffStart, nDstXCount,
    6168             :                                 nDstYCount, pDstBuffer.get(), nDstXCount,
    6169             :                                 nDstYCount, eWrkDataType, 0, 0, nullptr);
    6170             :                         }
    6171             :                     }
    6172      200332 :                     nDstXOff += nDstXCount;
    6173             :                 }
    6174         886 :                 nDstYOff += nDstYCount;
    6175             :             }
    6176             : 
    6177             :             // Copy from the temporary to the overview
    6178          28 :             for (int nDstYOff = nDstYOffStart;
    6179          54 :                  nDstYOff < nDstYOffEnd && eErr == CE_None;
    6180             :                  /* */)
    6181             :             {
    6182             :                 const int nDstYCount =
    6183          26 :                     std::min(nDstChunkYSize, nDstYOffEnd - nDstYOff);
    6184          26 :                 for (int nDstXOff = nDstXOffStart;
    6185          52 :                      nDstXOff < nDstXOffEnd && eErr == CE_None;
    6186             :                      /* */)
    6187             :                 {
    6188             :                     const int nDstXCount =
    6189          26 :                         std::min(nDstChunkXSize, nDstXOffEnd - nDstXOff);
    6190          56 :                     for (int iBand = 0; iBand < nBands && eErr == CE_None;
    6191             :                          ++iBand)
    6192             :                     {
    6193          30 :                         auto poSrcBand = poTmpDS->GetRasterBand(iBand + 1);
    6194          30 :                         eErr = poSrcBand->RasterIO(
    6195             :                             GF_Read, nDstXOff - nDstXOffStart,
    6196             :                             nDstYOff - nDstYOffStart, nDstXCount, nDstYCount,
    6197             :                             pDstBuffer.get(), nDstXCount, nDstYCount,
    6198             :                             eWrkDataType, 0, 0, nullptr);
    6199          30 :                         if (eErr == CE_None)
    6200             :                         {
    6201             :                             // Write to the destination overview bands
    6202          30 :                             auto poOvrBand =
    6203          30 :                                 papapoOverviewBands[iBand][iOverview];
    6204          30 :                             eErr = poOvrBand->RasterIO(
    6205             :                                 GF_Write, nDstXOff, nDstYOff, nDstXCount,
    6206             :                                 nDstYCount, pDstBuffer.get(), nDstXCount,
    6207             :                                 nDstYCount, eWrkDataType, 0, 0, nullptr);
    6208             :                         }
    6209             :                     }
    6210          26 :                     nDstXOff += nDstXCount;
    6211             :                 }
    6212          26 :                 nDstYOff += nDstYCount;
    6213             :             }
    6214             : 
    6215          28 :             if (eErr != CE_None)
    6216             :             {
    6217           2 :                 CPLError(CE_Failure, CPLE_AppDefined,
    6218             :                          "Failed to write overview %d", iOverview);
    6219           2 :                 return eErr;
    6220             :             }
    6221             : 
    6222             :             // Flush the data to overviews.
    6223          56 :             for (int iBand = 0; iBand < nBands; ++iBand)
    6224          30 :                 papapoOverviewBands[iBand][iOverview]->FlushCache(false);
    6225             : 
    6226          26 :             continue;
    6227             :         }
    6228             : 
    6229             :         // Structure describing a resampling job
    6230             :         struct OvrJob
    6231             :         {
    6232             :             // Buffers to free when job is finished
    6233             :             std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
    6234             :             std::unique_ptr<PointerHolder> oSrcBufferHolder{};
    6235             :             std::unique_ptr<PointerHolder> oDstBufferHolder{};
    6236             : 
    6237             :             GDALRasterBand *poDstBand = nullptr;
    6238             : 
    6239             :             // Input parameters of pfnResampleFn
    6240             :             GDALResampleFunction pfnResampleFn = nullptr;
    6241             :             GDALOverviewResampleArgs args{};
    6242             :             const void *pChunk = nullptr;
    6243             : 
    6244             :             // Output values of resampling function
    6245             :             CPLErr eErr = CE_Failure;
    6246             :             void *pDstBuffer = nullptr;
    6247             :             GDALDataType eDstBufferDataType = GDT_Unknown;
    6248             : 
    6249        3310 :             void NotifyFinished()
    6250             :             {
    6251        6620 :                 std::lock_guard guard(mutex);
    6252        3310 :                 bFinished = true;
    6253        3310 :                 cv.notify_one();
    6254        3310 :             }
    6255             : 
    6256           2 :             bool IsFinished()
    6257             :             {
    6258           2 :                 std::lock_guard guard(mutex);
    6259           4 :                 return bFinished;
    6260             :             }
    6261             : 
    6262          16 :             void WaitFinished()
    6263             :             {
    6264          32 :                 std::unique_lock oGuard(mutex);
    6265          21 :                 while (!bFinished)
    6266             :                 {
    6267           5 :                     cv.wait(oGuard);
    6268             :                 }
    6269          16 :             }
    6270             : 
    6271             :           private:
    6272             :             // Synchronization
    6273             :             bool bFinished = false;
    6274             :             std::mutex mutex{};
    6275             :             std::condition_variable cv{};
    6276             :         };
    6277             : 
    6278             :         // Thread function to resample
    6279        3310 :         const auto JobResampleFunc = [](void *pData)
    6280             :         {
    6281        3310 :             OvrJob *poJob = static_cast<OvrJob *>(pData);
    6282             : 
    6283        3310 :             poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
    6284             :                                                &(poJob->pDstBuffer),
    6285             :                                                &(poJob->eDstBufferDataType));
    6286             : 
    6287        3310 :             poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
    6288             : 
    6289        3310 :             poJob->NotifyFinished();
    6290        3310 :         };
    6291             : 
    6292             :         // Function to write resample data to target band
    6293        3310 :         const auto WriteJobData = [](const OvrJob *poJob)
    6294             :         {
    6295        6620 :             return poJob->poDstBand->RasterIO(
    6296        3310 :                 GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
    6297        3310 :                 poJob->args.nDstXOff2 - poJob->args.nDstXOff,
    6298        3310 :                 poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
    6299        3310 :                 poJob->args.nDstXOff2 - poJob->args.nDstXOff,
    6300        3310 :                 poJob->args.nDstYOff2 - poJob->args.nDstYOff,
    6301        3310 :                 poJob->eDstBufferDataType, 0, 0, nullptr);
    6302             :         };
    6303             : 
    6304             :         // Wait for completion of oldest job and serialize it
    6305             :         const auto WaitAndFinalizeOldestJob =
    6306          16 :             [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    6307             :         {
    6308          16 :             auto poOldestJob = jobList.front().get();
    6309          16 :             poOldestJob->WaitFinished();
    6310          16 :             CPLErr l_eErr = poOldestJob->eErr;
    6311          16 :             if (l_eErr == CE_None)
    6312             :             {
    6313          16 :                 l_eErr = WriteJobData(poOldestJob);
    6314             :             }
    6315             : 
    6316          16 :             jobList.pop_front();
    6317          16 :             return l_eErr;
    6318             :         };
    6319             : 
    6320             :         // Queue of jobs
    6321        1204 :         std::list<std::unique_ptr<OvrJob>> jobList;
    6322             : 
    6323        1204 :         std::vector<std::unique_ptr<void, VSIFreeReleaser>> apaChunk(nBands);
    6324             :         std::vector<std::unique_ptr<GByte, VSIFreeReleaser>>
    6325        1204 :             apabyChunkNoDataMask(nBands);
    6326             : 
    6327             :         // Iterate on destination overview, block by block.
    6328         602 :         for (int nDstYOff = nDstYOffStart;
    6329        2111 :              nDstYOff < nDstYOffEnd && eErr == CE_None;
    6330        1509 :              nDstYOff += nDstChunkYSize)
    6331             :         {
    6332             :             int nDstYCount;
    6333        1509 :             if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
    6334        1099 :                 nDstYCount = nDstChunkYSize;
    6335             :             else
    6336         410 :                 nDstYCount = nDstYOffEnd - nDstYOff;
    6337             : 
    6338        1509 :             int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    6339        1509 :             int nChunkYOff2 = static_cast<int>(
    6340        1509 :                 ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
    6341        1509 :             if (nChunkYOff2 > nSrcHeight ||
    6342        1509 :                 nDstYOff + nDstYCount == nDstTotalHeight)
    6343         595 :                 nChunkYOff2 = nSrcHeight;
    6344        1509 :             int nYCount = nChunkYOff2 - nChunkYOff;
    6345        1509 :             CPLAssert(nYCount <= nFullResYChunk);
    6346             : 
    6347        1509 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    6348        1509 :             int nChunkYSizeQueried =
    6349        1509 :                 nYCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
    6350        1509 :             if (nChunkYOffQueried < 0)
    6351             :             {
    6352         144 :                 nChunkYSizeQueried += nChunkYOffQueried;
    6353         144 :                 nChunkYOffQueried = 0;
    6354             :             }
    6355        1509 :             if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
    6356         143 :                 nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
    6357        1509 :             CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
    6358             : 
    6359        1509 :             if (!pfnProgress(std::min(1.0, dfCurPixelCount / dfTotalPixelCount),
    6360             :                              nullptr, pProgressData))
    6361             :             {
    6362           1 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6363           1 :                 eErr = CE_Failure;
    6364             :             }
    6365             : 
    6366             :             // Iterate on destination overview, block by block.
    6367        1509 :             for (int nDstXOff = nDstXOffStart;
    6368        3057 :                  nDstXOff < nDstXOffEnd && eErr == CE_None;
    6369        1548 :                  nDstXOff += nDstChunkXSize)
    6370             :             {
    6371        1548 :                 int nDstXCount = 0;
    6372        1548 :                 if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
    6373        1531 :                     nDstXCount = nDstChunkXSize;
    6374             :                 else
    6375          17 :                     nDstXCount = nDstXOffEnd - nDstXOff;
    6376             : 
    6377        1548 :                 dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
    6378             : 
    6379        1548 :                 int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    6380        1548 :                 int nChunkXOff2 = static_cast<int>(
    6381        1548 :                     ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    6382        1548 :                 if (nChunkXOff2 > nSrcWidth ||
    6383        1548 :                     nDstXOff + nDstXCount == nDstTotalWidth)
    6384        1473 :                     nChunkXOff2 = nSrcWidth;
    6385        1548 :                 const int nXCount = nChunkXOff2 - nChunkXOff;
    6386        1548 :                 CPLAssert(nXCount <= nFullResXChunk);
    6387             : 
    6388        1548 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    6389        1548 :                 int nChunkXSizeQueried =
    6390        1548 :                     nXCount + RADIUS_TO_DIAMETER * nKernelRadius * nOvrFactor;
    6391        1548 :                 if (nChunkXOffQueried < 0)
    6392             :                 {
    6393         203 :                     nChunkXSizeQueried += nChunkXOffQueried;
    6394         203 :                     nChunkXOffQueried = 0;
    6395             :                 }
    6396        1548 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
    6397         212 :                     nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
    6398        1548 :                 CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
    6399             : #if DEBUG_VERBOSE
    6400             :                 CPLDebug("GDAL",
    6401             :                          "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
    6402             :                          nChunkXOffQueried, nChunkYOffQueried,
    6403             :                          nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
    6404             :                          nDstYOff, nDstXCount, nDstYCount);
    6405             : #endif
    6406             : 
    6407             :                 // Avoid accumulating too many tasks and exhaust RAM
    6408             : 
    6409             :                 // Try to complete already finished jobs
    6410        1548 :                 while (eErr == CE_None && !jobList.empty())
    6411             :                 {
    6412           2 :                     auto poOldestJob = jobList.front().get();
    6413           2 :                     if (!poOldestJob->IsFinished())
    6414           2 :                         break;
    6415           0 :                     eErr = poOldestJob->eErr;
    6416           0 :                     if (eErr == CE_None)
    6417             :                     {
    6418           0 :                         eErr = WriteJobData(poOldestJob);
    6419             :                     }
    6420             : 
    6421           0 :                     jobList.pop_front();
    6422             :                 }
    6423             : 
    6424             :                 // And in case we have saturated the number of threads,
    6425             :                 // wait for completion of tasks to go below the threshold.
    6426        3096 :                 while (eErr == CE_None &&
    6427        1548 :                        jobList.size() >= static_cast<size_t>(nThreads))
    6428             :                 {
    6429           0 :                     eErr = WaitAndFinalizeOldestJob(jobList);
    6430             :                 }
    6431             : 
    6432             :                 // Read the source buffers for all the bands.
    6433        4859 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    6434             :                 {
    6435             :                     // (Re)allocate buffers if needed
    6436        3311 :                     if (apaChunk[iBand] == nullptr)
    6437             :                     {
    6438        1179 :                         apaChunk[iBand].reset(VSI_MALLOC3_VERBOSE(
    6439             :                             nFullResXChunkQueried, nFullResYChunkQueried,
    6440             :                             nWrkDataTypeSize));
    6441        1179 :                         if (apaChunk[iBand] == nullptr)
    6442             :                         {
    6443           0 :                             eErr = CE_Failure;
    6444             :                         }
    6445             :                     }
    6446        3652 :                     if (bUseNoDataMask &&
    6447         341 :                         apabyChunkNoDataMask[iBand] == nullptr)
    6448             :                     {
    6449         282 :                         apabyChunkNoDataMask[iBand].reset(
    6450         282 :                             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    6451             :                                 nFullResXChunkQueried, nFullResYChunkQueried)));
    6452         282 :                         if (apabyChunkNoDataMask[iBand] == nullptr)
    6453             :                         {
    6454           0 :                             eErr = CE_Failure;
    6455             :                         }
    6456             :                     }
    6457             : 
    6458        3311 :                     if (eErr == CE_None)
    6459             :                     {
    6460        3311 :                         GDALRasterBand *poSrcBand = nullptr;
    6461        3311 :                         if (iSrcOverview == -1)
    6462        2409 :                             poSrcBand = papoSrcBands[iBand];
    6463             :                         else
    6464         902 :                             poSrcBand =
    6465         902 :                                 papapoOverviewBands[iBand][iSrcOverview];
    6466        3311 :                         eErr = poSrcBand->RasterIO(
    6467             :                             GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    6468             :                             nChunkXSizeQueried, nChunkYSizeQueried,
    6469        3311 :                             apaChunk[iBand].get(), nChunkXSizeQueried,
    6470             :                             nChunkYSizeQueried, eWrkDataType, 0, 0, nullptr);
    6471             : 
    6472        3311 :                         if (bUseNoDataMask && eErr == CE_None)
    6473             :                         {
    6474         341 :                             auto poMaskBand = poSrcBand->IsMaskBand()
    6475         341 :                                                   ? poSrcBand
    6476         262 :                                                   : poSrcBand->GetMaskBand();
    6477         341 :                             eErr = poMaskBand->RasterIO(
    6478             :                                 GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    6479             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    6480         341 :                                 apabyChunkNoDataMask[iBand].get(),
    6481             :                                 nChunkXSizeQueried, nChunkYSizeQueried,
    6482             :                                 GDT_Byte, 0, 0, nullptr);
    6483             :                         }
    6484             :                     }
    6485             :                 }
    6486             : 
    6487             :                 // Compute the resulting overview block.
    6488        4858 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    6489             :                 {
    6490        6620 :                     auto poJob = std::make_unique<OvrJob>();
    6491        3310 :                     poJob->pfnResampleFn = pfnResampleFn;
    6492        3310 :                     poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
    6493        6620 :                     poJob->args.eOvrDataType =
    6494        3310 :                         poJob->poDstBand->GetRasterDataType();
    6495        3310 :                     poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
    6496        3310 :                     poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
    6497        3310 :                     const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
    6498        3310 :                         "NBITS", "IMAGE_STRUCTURE");
    6499        3310 :                     poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    6500        3310 :                     poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    6501        3310 :                     poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    6502        3310 :                     poJob->args.eWrkDataType = eWrkDataType;
    6503        3310 :                     poJob->pChunk = apaChunk[iBand].get();
    6504        3310 :                     poJob->args.pabyChunkNodataMask =
    6505        3310 :                         apabyChunkNoDataMask[iBand].get();
    6506        3310 :                     poJob->args.nChunkXOff = nChunkXOffQueried;
    6507        3310 :                     poJob->args.nChunkXSize = nChunkXSizeQueried;
    6508        3310 :                     poJob->args.nChunkYOff = nChunkYOffQueried;
    6509        3310 :                     poJob->args.nChunkYSize = nChunkYSizeQueried;
    6510        3310 :                     poJob->args.nDstXOff = nDstXOff;
    6511        3310 :                     poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
    6512        3310 :                     poJob->args.nDstYOff = nDstYOff;
    6513        3310 :                     poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
    6514        3310 :                     poJob->args.pszResampling = pszResampling;
    6515        3310 :                     poJob->args.bHasNoData = abHasNoData[iBand];
    6516        3310 :                     poJob->args.dfNoDataValue = adfNoDataValue[iBand];
    6517        3310 :                     poJob->args.eSrcDataType = eDataType;
    6518        3310 :                     poJob->args.bPropagateNoData = bPropagateNoData;
    6519             : 
    6520        3310 :                     if (poJobQueue)
    6521             :                     {
    6522          32 :                         poJob->oSrcMaskBufferHolder.reset(new PointerHolder(
    6523          16 :                             apabyChunkNoDataMask[iBand].release()));
    6524             : 
    6525          32 :                         poJob->oSrcBufferHolder.reset(
    6526          16 :                             new PointerHolder(apaChunk[iBand].release()));
    6527             : 
    6528          16 :                         poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    6529          16 :                         jobList.emplace_back(std::move(poJob));
    6530             :                     }
    6531             :                     else
    6532             :                     {
    6533        3294 :                         JobResampleFunc(poJob.get());
    6534        3294 :                         eErr = poJob->eErr;
    6535        3294 :                         if (eErr == CE_None)
    6536             :                         {
    6537        3294 :                             eErr = WriteJobData(poJob.get());
    6538             :                         }
    6539             :                     }
    6540             :                 }
    6541             :             }
    6542             :         }
    6543             : 
    6544             :         // Wait for all pending jobs to complete
    6545         618 :         while (!jobList.empty())
    6546             :         {
    6547          16 :             const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    6548          16 :             if (l_eErr != CE_None && eErr == CE_None)
    6549           0 :                 eErr = l_eErr;
    6550             :         }
    6551             : 
    6552             :         // Flush the data to overviews.
    6553        1779 :         for (int iBand = 0; iBand < nBands; ++iBand)
    6554             :         {
    6555        1177 :             if (papapoOverviewBands[iBand][iOverview]->FlushCache(false) !=
    6556             :                 CE_None)
    6557           0 :                 eErr = CE_Failure;
    6558             :         }
    6559             :     }
    6560             : 
    6561         382 :     if (eErr == CE_None)
    6562         378 :         pfnProgress(1.0, nullptr, pProgressData);
    6563             : 
    6564         382 :     return eErr;
    6565             : }
    6566             : 
    6567             : /************************************************************************/
    6568             : /*            GDALRegenerateOverviewsMultiBand()                        */
    6569             : /************************************************************************/
    6570             : 
    6571             : /**
    6572             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    6573             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example).
    6574             :  *
    6575             :  * This function will generate one or more overview images from a base
    6576             :  * image using the requested downsampling algorithm.  Its primary use
    6577             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    6578             :  * can also be used to generate downsampled images in one file from another
    6579             :  * outside the overview architecture.
    6580             :  *
    6581             :  * The output bands need to exist in advance and share the same characteristics
    6582             :  * (type, dimensions).
    6583             :  *
    6584             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    6585             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR".
    6586             :  *
    6587             :  * It does not support color tables or complex data types.
    6588             :  *
    6589             :  * The pseudo-algorithm used by the function is:
    6590             :  *    for each overview
    6591             :  *       iterate on lines of the source by a step of deltay
    6592             :  *           iterate on columns of the source by a step of deltax
    6593             :  *               read the source data of size deltax * deltay for all the bands
    6594             :  *               generate the corresponding overview block for all the bands
    6595             :  *
    6596             :  * This function properly honours NODATA_VALUES tuples (special dataset
    6597             :  * metadata) so that only a given RGB triplet (in case of an RGB image) will be
    6598             :  * considered as the nodata value and not each value of the triplet
    6599             :  * independently per band.
    6600             :  *
    6601             :  * The GDAL_NUM_THREADS configuration option can be set
    6602             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use
    6603             :  * for overview computation.
    6604             :  *
    6605             :  * @param apoSrcBands the list of source bands to downsample
    6606             :  * @param aapoOverviewBands two-dimensional array of bands. First dimension is
    6607             :  *                          indexed by bands. Second dimension is indexed by
    6608             :  *                          overview levels. All aapoOverviewBands[i] arrays
    6609             :  *                          must have the same size (i.e. same number of
    6610             :  *                          overviews)
    6611             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    6612             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
    6613             :  * @param pfnProgress progress report function.
    6614             :  * @param pProgressData progress function callback data.
    6615             :  * @param papszOptions NULL terminated list of options as
    6616             :  *                     key=value pairs, or NULL
    6617             :  *                     The XOFF, YOFF, XSIZE and YSIZE
    6618             :  *                     options can be specified to express that overviews should
    6619             :  *                     be regenerated only in the specified subset of the source
    6620             :  *                     dataset.
    6621             :  * @return CE_None on success or CE_Failure on failure.
    6622             :  * @since 3.10
    6623             :  */
    6624             : 
    6625          19 : CPLErr GDALRegenerateOverviewsMultiBand(
    6626             :     const std::vector<GDALRasterBand *> &apoSrcBands,
    6627             :     const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
    6628             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    6629             :     void *pProgressData, CSLConstList papszOptions)
    6630             : {
    6631          19 :     CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());  // one overview list per source band
    6632          29 :     for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
    6633             :     {
    6634          10 :         CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());  // same level count for every band
    6635             :     }
    6636             :     // No bands: nothing to regenerate (this also keeps the [0] access below valid).
    6637          19 :     if (aapoOverviewBands.empty())
    6638           0 :         return CE_None;
    6639             :     // Copy each per-band vector into a temporary CPLMalloc'ed pointer array.
    6640          19 :     std::vector<GDALRasterBand **> apapoOverviewBands;
    6641          48 :     for (auto &apoOverviewBands : aapoOverviewBands)
    6642             :     {
    6643             :         auto papoOverviewBands = static_cast<GDALRasterBand **>(
    6644          29 :             CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
    6645          61 :         for (size_t i = 0; i < apoOverviewBands.size(); ++i)
    6646             :         {
    6647          32 :             papoOverviewBands[i] = apoOverviewBands[i];
    6648             :         }
    6649          29 :         apapoOverviewBands.push_back(papoOverviewBands);
    6650             :     }
    6651          38 :     const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
    6652          19 :         static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
    6653          19 :         static_cast<int>(aapoOverviewBands[0].size()),
    6654          19 :         apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
    6655             :         papszOptions);  // call taking explicit counts and raw pointer arrays
    6656          48 :     for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
    6657          29 :         CPLFree(papoOverviewBands);  // release the temporary arrays
    6658          19 :     return eErr;
    6659             : }
    6660             : 
    6661             : /************************************************************************/
    6662             : /*                        GDALComputeBandStats()                        */
    6663             : /************************************************************************/
    6664             : 
    6665             : /** Compute mean and standard deviation of a band (magnitudes for complex data).
    6666             :  * @param hSrcBand source band handle (checked with VALIDATE_POINTER1).
    6667             :  * @param nSampleStep Step between scanlines used to compute statistics.
    6668             :  *                    When nSampleStep is equal to 1, all scanlines will
    6669             :  *                    be processed. Values < 1 or >= band height become 1.
    6670             :  * @param pdfMean receives the mean of the sampled pixels; may be NULL.
    6671             :  * @param pdfStdDev receives their standard deviation; may be NULL.
    6672             :  * @param pfnProgress progress callback; NULL selects GDALDummyProgress.
    6673             :  * @param pProgressData user data passed through to pfnProgress.
    6674             :  * @return CE_None on success, else CE_Failure or the failing RasterIO() code.
    6675             :  */
    6676          18 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
    6677             :                                         int nSampleStep, double *pdfMean,
    6678             :                                         double *pdfStdDev,
    6679             :                                         GDALProgressFunc pfnProgress,
    6680             :                                         void *pProgressData)
    6681             : 
    6682             : {
    6683          18 :     VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
    6684             : 
    6685          18 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    6686             : 
    6687          18 :     if (pfnProgress == nullptr)
    6688          18 :         pfnProgress = GDALDummyProgress;
    6689             : 
    6690          18 :     const int nWidth = poSrcBand->GetXSize();
    6691          18 :     const int nHeight = poSrcBand->GetYSize();
    6692             :     // An out-of-range step falls back to sampling every scanline.
    6693          18 :     if (nSampleStep >= nHeight || nSampleStep < 1)
    6694           5 :         nSampleStep = 1;
    6695             :     // One-scanline float buffer; complex bands are read as GDT_CFloat32 (two floats per pixel).
    6696          18 :     GDALDataType eWrkType = GDT_Unknown;
    6697          18 :     float *pafData = nullptr;
    6698          18 :     GDALDataType eType = poSrcBand->GetRasterDataType();
    6699          18 :     const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    6700          18 :     if (bComplex)
    6701             :     {
    6702             :         pafData = static_cast<float *>(
    6703           0 :             VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
    6704           0 :         eWrkType = GDT_CFloat32;
    6705             :     }
    6706             :     else
    6707             :     {
    6708             :         pafData =
    6709          18 :             static_cast<float *>(VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
    6710          18 :         eWrkType = GDT_Float32;
    6711             :     }
    6712             :     // Zero-width band or failed allocation: nothing can be computed.
    6713          18 :     if (nWidth == 0 || pafData == nullptr)
    6714             :     {
    6715           0 :         VSIFree(pafData);
    6716           0 :         return CE_Failure;
    6717             :     }
    6718             : 
    6719             :     /* -------------------------------------------------------------------- */
    6720             :     /*      Loop over all sample lines.                                     */
    6721             :     /* -------------------------------------------------------------------- */
    6722          18 :     double dfSum = 0.0;
    6723          18 :     double dfSum2 = 0.0;
    6724          18 :     int iLine = 0;
    6725          18 :     GIntBig nSamples = 0;
    6726             :     // do/while: scanline 0 is always read, then every nSampleStep-th line.
    6727        2143 :     do
    6728             :     {
    6729        2161 :         if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    6730             :                          pProgressData))
    6731             :         {
    6732           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6733           0 :             CPLFree(pafData);
    6734           0 :             return CE_Failure;
    6735             :         }
    6736             : 
    6737             :         const CPLErr eErr =
    6738        2161 :             poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
    6739             :                                 1, eWrkType, 0, 0, nullptr);
    6740        2161 :         if (eErr != CE_None)
    6741             :         {
    6742           1 :             CPLFree(pafData);
    6743           1 :             return eErr;
    6744             :         }
    6745             :         // Accumulate the sum and the sum of squares in double precision.
    6746      725208 :         for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    6747             :         {
    6748      723048 :             float fValue = 0.0f;
    6749             : 
    6750      723048 :             if (bComplex)
    6751             :             {
    6752             :                 // Compute the magnitude of the complex value.
    6753             :                 fValue =
    6754           0 :                     std::hypot(pafData[static_cast<size_t>(iPixel) * 2],
    6755           0 :                                pafData[static_cast<size_t>(iPixel) * 2 + 1]);
    6756             :             }
    6757             :             else
    6758             :             {
    6759      723048 :                 fValue = pafData[iPixel];
    6760             :             }
    6761             : 
    6762      723048 :             dfSum += static_cast<double>(fValue);
    6763      723048 :             dfSum2 += static_cast<double>(fValue) * static_cast<double>(fValue);
    6764             :         }
    6765             : 
    6766        2160 :         nSamples += nWidth;
    6767        2160 :         iLine += nSampleStep;
    6768        2160 :     } while (iLine < nHeight);
    6769             : 
    6770          17 :     if (!pfnProgress(1.0, nullptr, pProgressData))
    6771             :     {
    6772           0 :         CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6773           0 :         CPLFree(pafData);
    6774           0 :         return CE_Failure;
    6775             :     }
    6776             : 
    6777             :     /* -------------------------------------------------------------------- */
    6778             :     /*      Produce the result values.                                      */
    6779             :     /* -------------------------------------------------------------------- */
    6780          17 :     if (pdfMean != nullptr)
    6781          17 :         *pdfMean = dfSum / nSamples;
    6782             : 
    6783          17 :     if (pdfStdDev != nullptr)
    6784             :     {
    6785          17 :         const double dfMean = dfSum / nSamples;
    6786             :         // Variance as E[x^2] - E[x]^2. NOTE(review): rounding can presumably push this slightly below 0 (sqrt -> NaN) - confirm.
    6787          17 :         *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
    6788             :     }
    6789             : 
    6790          17 :     CPLFree(pafData);
    6791             : 
    6792          17 :     return CE_None;
    6793             : }
    6794             : 
    6795             : /************************************************************************/
    6796             : /*                  GDALOverviewMagnitudeCorrection()                   */
    6797             : /*                                                                      */
    6798             : /*      Correct the mean and standard deviation of the overviews of     */
    6799             : /*      the given band to match the base layer approximately.           */
    6800             : /************************************************************************/
    6801             : 
    6802             : /** Adjust overviews so their mean/stddev approximately match the base band.
    6803             :  * @param hBaseBand base band handle (checked with VALIDATE_POINTER1).
    6804             :  * @param nOverviewCount number of handles in pahOverviews.
    6805             :  * @param pahOverviews overview band handles; each band is read and rewritten.
    6806             :  * @param pfnProgress progress callback. NOTE(review): invoked below without a NULL check.
    6807             :  * @param pProgressData user data passed through to pfnProgress.
    6808             :  * @return CE_None on success, else CE_Failure or the GDALComputeBandStats() error.
    6809             :  */
    6810           0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
    6811             :                                        int nOverviewCount,
    6812             :                                        GDALRasterBandH *pahOverviews,
    6813             :                                        GDALProgressFunc pfnProgress,
    6814             :                                        void *pProgressData)
    6815             : 
    6816             : {
    6817           0 :     VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
    6818             : 
    6819             :     /* -------------------------------------------------------------------- */
    6820             :     /*      Compute mean/stddev for source raster (every 2nd scanline).     */
    6821             :     /* -------------------------------------------------------------------- */
    6822           0 :     double dfOrigMean = 0.0;
    6823           0 :     double dfOrigStdDev = 0.0;
    6824             :     {
    6825             :         const CPLErr eErr =
    6826           0 :             GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
    6827             :                                  pfnProgress, pProgressData);
    6828             : 
    6829           0 :         if (eErr != CE_None)
    6830           0 :             return eErr;
    6831             :     }
    6832             : 
    6833             :     /* -------------------------------------------------------------------- */
    6834             :     /*      Loop on overview bands.                                         */
    6835             :     /* -------------------------------------------------------------------- */
    6836           0 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    6837             :     {
    6838             :         GDALRasterBand *poOverview =
    6839           0 :             GDALRasterBand::FromHandle(pahOverviews[iOverview]);
    6840             :         double dfOverviewMean, dfOverviewStdDev;  // written by GDALComputeBandStats() below
    6841             :         // Overview statistics are taken on every scanline (step 1).
    6842             :         const CPLErr eErr =
    6843           0 :             GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
    6844             :                                  &dfOverviewStdDev, pfnProgress, pProgressData);
    6845             : 
    6846           0 :         if (eErr != CE_None)
    6847           0 :             return eErr;
    6848             :         // Gain stays 1 for a near-constant base band. NOTE(review): dfOverviewStdDev is not checked for 0 before the division.
    6849           0 :         double dfGain = 1.0;
    6850           0 :         if (dfOrigStdDev >= 0.0001)
    6851           0 :             dfGain = dfOrigStdDev / dfOverviewStdDev;
    6852             : 
    6853             :         /* --------------------------------------------------------------------
    6854             :          */
    6855             :         /*      Apply gain and offset (complex data: gain only). */
    6856             :         /* --------------------------------------------------------------------
    6857             :          */
    6858           0 :         const int nWidth = poOverview->GetXSize();
    6859           0 :         const int nHeight = poOverview->GetYSize();
    6860             :         // One-scanline float buffer; complex bands use GDT_CFloat32 (two floats per pixel).
    6861           0 :         GDALDataType eWrkType = GDT_Unknown;
    6862           0 :         float *pafData = nullptr;
    6863           0 :         const GDALDataType eType = poOverview->GetRasterDataType();
    6864           0 :         const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    6865           0 :         if (bComplex)
    6866             :         {
    6867             :             pafData = static_cast<float *>(
    6868           0 :                 VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
    6869           0 :             eWrkType = GDT_CFloat32;
    6870             :         }
    6871             :         else
    6872             :         {
    6873             :             pafData = static_cast<float *>(
    6874           0 :                 VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
    6875           0 :             eWrkType = GDT_Float32;
    6876             :         }
    6877             : 
    6878           0 :         if (pafData == nullptr)
    6879             :         {
    6880           0 :             return CE_Failure;
    6881             :         }
    6882             :         // Read, correct and write back the overview one scanline at a time.
    6883           0 :         for (int iLine = 0; iLine < nHeight; ++iLine)
    6884             :         {
    6885           0 :             if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    6886             :                              pProgressData))
    6887             :             {
    6888           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6889           0 :                 CPLFree(pafData);
    6890           0 :                 return CE_Failure;
    6891             :             }
    6892             : 
    6893           0 :             if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
    6894             :                                      nWidth, 1, eWrkType, 0, 0,
    6895           0 :                                      nullptr) != CE_None)
    6896             :             {
    6897           0 :                 CPLFree(pafData);
    6898           0 :                 return CE_Failure;
    6899             :             }
    6900             :             // Complex: scale both components; real: recentre on the base mean after scaling.
    6901           0 :             for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    6902             :             {
    6903           0 :                 if (bComplex)
    6904             :                 {
    6905           0 :                     pafData[static_cast<size_t>(iPixel) * 2] *=
    6906           0 :                         static_cast<float>(dfGain);
    6907           0 :                     pafData[static_cast<size_t>(iPixel) * 2 + 1] *=
    6908           0 :                         static_cast<float>(dfGain);
    6909             :                 }
    6910             :                 else
    6911             :                 {
    6912           0 :                     pafData[iPixel] = static_cast<float>(
    6913           0 :                         (double(pafData[iPixel]) - dfOverviewMean) * dfGain +
    6914             :                         dfOrigMean);
    6915             :                 }
    6916             :             }
    6917             :             // Write the corrected scanline back to the same overview band.
    6918           0 :             if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
    6919             :                                      nWidth, 1, eWrkType, 0, 0,
    6920           0 :                                      nullptr) != CE_None)
    6921             :             {
    6922           0 :                 CPLFree(pafData);
    6923           0 :                 return CE_Failure;
    6924             :             }
    6925             :         }
    6926             : 
    6927           0 :         if (!pfnProgress(1.0, nullptr, pProgressData))
    6928             :         {
    6929           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6930           0 :             CPLFree(pafData);
    6931           0 :             return CE_Failure;
    6932             :         }
    6933             : 
    6934           0 :         CPLFree(pafData);
    6935             :     }
    6936             : 
    6937           0 :     return CE_None;
    6938             : }

Generated by: LCOV version 1.14