LCOV - code coverage report
Current view: top level - gcore - overview.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2423 2792 86.8 %
Date: 2024-11-21 22:18:42 Functions: 110 135 81.5 %

          Line data    Source code
       1             : 
       2             : /******************************************************************************
       3             :  *
       4             :  * Project:  GDAL Core
       5             :  * Purpose:  Helper code to implement overview support in different drivers.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 2000, Frank Warmerdam
      10             :  * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal_priv.h"
      17             : 
      18             : #include <cmath>
      19             : #include <cstddef>
      20             : #include <cstdlib>
      21             : 
      22             : #include <algorithm>
      23             : #include <complex>
      24             : #include <condition_variable>
      25             : #include <limits>
      26             : #include <list>
      27             : #include <memory>
      28             : #include <mutex>
      29             : #include <vector>
      30             : 
      31             : #include "cpl_conv.h"
      32             : #include "cpl_error.h"
      33             : #include "cpl_progress.h"
      34             : #include "cpl_vsi.h"
      35             : #include "gdal.h"
      36             : #include "gdal_thread_pool.h"
      37             : #include "gdalwarper.h"
      38             : 
      39             : #ifdef USE_NEON_OPTIMIZATIONS
      40             : #include "include_sse2neon.h"
      41             : #define USE_SSE2
      42             : 
      43             : #include "gdalsse_priv.h"
      44             : 
      45             : // Restrict to 64bit processors because they are guaranteed to have SSE2,
      46             : // or if __AVX2__ is defined.
      47             : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
      48             : #define USE_SSE2
      49             : 
      50             : #include "gdalsse_priv.h"
      51             : 
      52             : #ifdef __SSE3__
      53             : #include <pmmintrin.h>
      54             : #endif
      55             : #ifdef __SSSE3__
      56             : #include <tmmintrin.h>
      57             : #endif
      58             : #ifdef __SSE4_1__
      59             : #include <smmintrin.h>
      60             : #endif
      61             : #ifdef __AVX2__
      62             : #include <immintrin.h>
      63             : #endif
      64             : 
      65             : #endif
      66             : 
      67             : // To be included after above USE_SSE2 and include gdalsse_priv.h
      68             : // to avoid build issue on Windows x86
      69             : #include "gdal_priv_templates.hpp"
      70             : 
      71             : /************************************************************************/
      72             : /*                      GDALResampleChunk_Near()                        */
      73             : /************************************************************************/
      74             : 
      75             : template <class T>
      76        6034 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
      77             :                                       const T *pChunk, T **ppDstBuffer)
      78             : 
      79             : {
      80        6034 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
      81        6034 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
      82        6034 :     const GDALDataType eWrkDataType = args.eWrkDataType;
      83        6034 :     const int nChunkXOff = args.nChunkXOff;
      84        6034 :     const int nChunkXSize = args.nChunkXSize;
      85        6034 :     const int nChunkYOff = args.nChunkYOff;
      86        6034 :     const int nDstXOff = args.nDstXOff;
      87        6034 :     const int nDstXOff2 = args.nDstXOff2;
      88        6034 :     const int nDstYOff = args.nDstYOff;
      89        6034 :     const int nDstYOff2 = args.nDstYOff2;
      90        6034 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
      91             : 
      92             :     /* -------------------------------------------------------------------- */
      93             :     /*      Allocate buffers.                                               */
      94             :     /* -------------------------------------------------------------------- */
      95        6034 :     *ppDstBuffer = static_cast<T *>(
      96        6034 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
      97             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
      98        6034 :     if (*ppDstBuffer == nullptr)
      99             :     {
     100           0 :         return CE_Failure;
     101             :     }
     102        6034 :     T *const pDstBuffer = *ppDstBuffer;
     103             : 
     104             :     int *panSrcXOff =
     105        6034 :         static_cast<int *>(VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(int)));
     106             : 
     107        6034 :     if (panSrcXOff == nullptr)
     108             :     {
     109           0 :         VSIFree(panSrcXOff);
     110           0 :         return CE_Failure;
     111             :     }
     112             : 
     113             :     /* ==================================================================== */
     114             :     /*      Precompute inner loop constants.                                */
     115             :     /* ==================================================================== */
     116      572295 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
     117             :     {
     118      566261 :         int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
     119      566261 :         if (nSrcXOff < nChunkXOff)
     120           0 :             nSrcXOff = nChunkXOff;
     121             : 
     122      566261 :         panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
     123             :     }
     124             : 
     125             :     /* ==================================================================== */
     126             :     /*      Loop over destination scanlines.                                */
     127             :     /* ==================================================================== */
     128      210662 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
     129             :     {
     130      204628 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
     131      204628 :         if (nSrcYOff < nChunkYOff)
     132           0 :             nSrcYOff = nChunkYOff;
     133             : 
     134      204628 :         const T *const pSrcScanline =
     135             :             pChunk +
     136      204628 :             (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
     137      202158 :             nChunkXOff;
     138             : 
     139             :         /* --------------------------------------------------------------------
     140             :          */
     141             :         /*      Loop over destination pixels */
     142             :         /* --------------------------------------------------------------------
     143             :          */
     144      204628 :         T *pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
     145   116297034 :         for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
     146             :         {
     147   116092564 :             pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
     148             :         }
     149             :     }
     150             : 
     151        6034 :     CPLFree(panSrcXOff);
     152             : 
     153        6034 :     return CE_None;
     154             : }
     155             : 
     156        6034 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
     157             :                                      const void *pChunk, void **ppDstBuffer,
     158             :                                      GDALDataType *peDstBufferDataType)
     159             : {
     160        6034 :     *peDstBufferDataType = args.eWrkDataType;
     161        6034 :     switch (args.eWrkDataType)
     162             :     {
     163             :         // For nearest resampling, as no computation is done, only the
     164             :         // size of the data type matters.
     165        5906 :         case GDT_Byte:
     166             :         case GDT_Int8:
     167             :         {
     168        5906 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
     169        5906 :             return GDALResampleChunk_NearT(
     170             :                 args, static_cast<const uint8_t *>(pChunk),
     171        5906 :                 reinterpret_cast<uint8_t **>(ppDstBuffer));
     172             :         }
     173             : 
     174          26 :         case GDT_Int16:
     175             :         case GDT_UInt16:
     176             :         {
     177          26 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
     178          26 :             return GDALResampleChunk_NearT(
     179             :                 args, static_cast<const uint16_t *>(pChunk),
     180          26 :                 reinterpret_cast<uint16_t **>(ppDstBuffer));
     181             :         }
     182             : 
     183          55 :         case GDT_CInt16:
     184             :         case GDT_Int32:
     185             :         case GDT_UInt32:
     186             :         case GDT_Float32:
     187             :         {
     188          55 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
     189          55 :             return GDALResampleChunk_NearT(
     190             :                 args, static_cast<const uint32_t *>(pChunk),
     191          55 :                 reinterpret_cast<uint32_t **>(ppDstBuffer));
     192             :         }
     193             : 
     194          43 :         case GDT_CInt32:
     195             :         case GDT_CFloat32:
     196             :         case GDT_Int64:
     197             :         case GDT_UInt64:
     198             :         case GDT_Float64:
     199             :         {
     200          43 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
     201          43 :             return GDALResampleChunk_NearT(
     202             :                 args, static_cast<const uint64_t *>(pChunk),
     203          43 :                 reinterpret_cast<uint64_t **>(ppDstBuffer));
     204             :         }
     205             : 
     206           4 :         case GDT_CFloat64:
     207             :         {
     208           4 :             return GDALResampleChunk_NearT(
     209             :                 args, static_cast<const std::complex<double> *>(pChunk),
     210           4 :                 reinterpret_cast<std::complex<double> **>(ppDstBuffer));
     211             :         }
     212             : 
     213           0 :         case GDT_Unknown:
     214             :         case GDT_TypeCount:
     215           0 :             break;
     216             :     }
     217           0 :     CPLAssert(false);
     218             :     return CE_Failure;
     219             : }
     220             : 
     221             : namespace
     222             : {
     223             : 
     224             : // Find in the color table the entry whose RGB value is the closest
     225             : // (using quadratic distance) to the test color, ignoring transparent entries.
     226        3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
     227             :                    const GDALColorEntry &test)
     228             : {
     229        3837 :     int nMinDist = std::numeric_limits<int>::max();
     230        3837 :     size_t bestEntry = 0;
     231      986109 :     for (size_t i = 0; i < entries.size(); ++i)
     232             :     {
     233      982272 :         const GDALColorEntry &entry = entries[i];
     234             :         // Ignore transparent entries
     235      982272 :         if (entry.c4 == 0)
     236        3237 :             continue;
     237             : 
     238      979035 :         int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
     239      979035 :                     ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
     240      979035 :                     ((test.c3 - entry.c3) * (test.c3 - entry.c3));
     241      979035 :         if (nDist < nMinDist)
     242             :         {
     243       15847 :             nMinDist = nDist;
     244       15847 :             bestEntry = i;
     245             :         }
     246             :     }
     247        3837 :     return static_cast<int>(bestEntry);
     248             : }
     249             : 
     250           7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
     251             :                                            int &transparentIdx)
     252             : {
     253           7 :     std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
     254             : 
     255           7 :     transparentIdx = -1;
     256           7 :     int i = 0;
     257        1799 :     for (auto &entry : entries)
     258             :     {
     259        1792 :         table.GetColorEntryAsRGB(i, &entry);
     260        1792 :         if (transparentIdx < 0 && entry.c4 == 0)
     261           1 :             transparentIdx = i;
     262        1792 :         ++i;
     263             :     }
     264           7 :     return entries;
     265             : }
     266             : 
     267             : }  // unnamed  namespace
     268             : 
     269             : /************************************************************************/
     270             : /*                             SQUARE()                                 */
     271             : /************************************************************************/
     272             : 
     273        3721 : template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
     274             : {
     275        3721 :     return static_cast<Tsquare>(val) * val;
     276             : }
     277             : 
     278             : /************************************************************************/
     279             : /*                          ComputeIntegerRMS()                         */
     280             : /************************************************************************/
     281             : // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
     282             : // integer that minimizes abs(rms**2 - sumSquares / weight)
     283             : template <class T, class Twork>
     284          42 : inline T ComputeIntegerRMS(double sumSquares, double weight)
     285             : {
     286          42 :     const double sumDivWeight = sumSquares / weight;
     287          42 :     T rms = static_cast<T>(sqrt(sumDivWeight));
     288             : 
     289             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     290             :     // Naive version:
     291             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     292          42 :     if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
     293          42 :         2 * sumDivWeight)
     294           6 :         rms += 1;
     295          42 :     return rms;
     296             : }
     297             : 
     298           0 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
     299             : {
     300           0 :     CPLAssert(false);
     301             :     return 0;
     302             : }
     303             : 
     304          24 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
     305             : {
     306             :     // It has been verified that given the correction on rms below, using
     307             :     // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
     308             :     // is equivalent, so use the former as it is used twice.
     309          24 :     const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
     310          24 :     const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
     311          24 :     GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
     312             : 
     313             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     314             :     // Naive version:
     315             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     316             :     // Optimized version for integer case and weight == 4
     317          24 :     if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
     318           5 :         rms += 1;
     319          24 :     return rms;
     320             : }
     321             : 
     322             : template <>
     323          20 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
     324             : {
     325          20 :     const double sumDivWeight = sumSquares * 0.25;
     326          20 :     GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
     327             : 
     328             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     329             :     // Naive version:
     330             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     331             :     // Optimized version for integer case and weight == 4
     332          20 :     if (static_cast<GUInt32>(rms) * (rms + 1) <
     333          20 :         static_cast<GUInt32>(sumDivWeight + 0.25))
     334           4 :         rms += 1;
     335          20 :     return rms;
     336             : }
     337             : 
     338             : #ifdef USE_SSE2
     339             : 
     340             : /************************************************************************/
     341             : /*                   QuadraticMeanByteSSE2OrAVX2()                      */
     342             : /************************************************************************/
     343             : 
     344             : #if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
     345             : #define sse2_packus_epi32 _mm_packus_epi32
     346             : #else
     347      516119 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
     348             : {
     349      516119 :     const auto minus32768_32 = _mm_set1_epi32(-32768);
     350      516119 :     const auto minus32768_16 = _mm_set1_epi16(-32768);
     351      516119 :     a = _mm_add_epi32(a, minus32768_32);
     352      516119 :     b = _mm_add_epi32(b, minus32768_32);
     353      516119 :     a = _mm_packs_epi32(a, b);
     354      516119 :     a = _mm_sub_epi16(a, minus32768_16);
     355      516119 :     return a;
     356             : }
     357             : #endif
     358             : 
     359             : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
     360             : #define sse2_hadd_epi16 _mm_hadd_epi16
     361             : #else
     362     4660800 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
     363             : {
     364             :     // Horizontal addition of adjacent pairs
     365     4660800 :     const auto mask = _mm_set1_epi32(0xFFFF);
     366             :     const auto horizLo =
     367    13982400 :         _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
     368             :     const auto horizHi =
     369    13982400 :         _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
     370             : 
     371             :     // Recombine low and high parts
     372     4660800 :     return _mm_packs_epi32(horizLo, horizHi);
     373             : }
     374             : #endif
     375             : 
     376             : #ifdef __AVX2__
     377             : 
     378             : #define DEST_ELTS 16
     379             : #define set1_epi16 _mm256_set1_epi16
     380             : #define set1_epi32 _mm256_set1_epi32
     381             : #define setzero _mm256_setzero_si256
     382             : #define set1_ps _mm256_set1_ps
     383             : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
     384             : #define unpacklo_epi8 _mm256_unpacklo_epi8
     385             : #define unpackhi_epi8 _mm256_unpackhi_epi8
     386             : #define madd_epi16 _mm256_madd_epi16
     387             : #define add_epi32 _mm256_add_epi32
     388             : #define mul_ps _mm256_mul_ps
     389             : #define cvtepi32_ps _mm256_cvtepi32_ps
     390             : #define sqrt_ps _mm256_sqrt_ps
     391             : #define cvttps_epi32 _mm256_cvttps_epi32
     392             : #define packs_epi32 _mm256_packs_epi32
     393             : #define packus_epi32 _mm256_packus_epi32
     394             : #define srli_epi32 _mm256_srli_epi32
     395             : #define mullo_epi16 _mm256_mullo_epi16
     396             : #define srli_epi16 _mm256_srli_epi16
     397             : #define cmpgt_epi16 _mm256_cmpgt_epi16
     398             : #define add_epi16 _mm256_add_epi16
     399             : #define sub_epi16 _mm256_sub_epi16
     400             : #define packus_epi16 _mm256_packus_epi16
     401             : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
     402             : /* to get the lower 128-bit bits of what would be a true 256-bit vector register
     403             :  */
     404             : #define store_lo(x, y)                                                         \
     405             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
     406             :                      _mm256_extracti128_si256(                                 \
     407             :                          _mm256_permute4x64_epi64((y), 0 | (2 << 2)), 0))
     408             : #define hadd_epi16 _mm256_hadd_epi16
     409             : #define zeroupper() _mm256_zeroupper()
     410             : #else
     411             : #define DEST_ELTS 8
     412             : #define set1_epi16 _mm_set1_epi16
     413             : #define set1_epi32 _mm_set1_epi32
     414             : #define setzero _mm_setzero_si128
     415             : #define set1_ps _mm_set1_ps
     416             : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
     417             : #define unpacklo_epi8 _mm_unpacklo_epi8
     418             : #define unpackhi_epi8 _mm_unpackhi_epi8
     419             : #define madd_epi16 _mm_madd_epi16
     420             : #define add_epi32 _mm_add_epi32
     421             : #define mul_ps _mm_mul_ps
     422             : #define cvtepi32_ps _mm_cvtepi32_ps
     423             : #define sqrt_ps _mm_sqrt_ps
     424             : #define cvttps_epi32 _mm_cvttps_epi32
     425             : #define packs_epi32 _mm_packs_epi32
     426             : #define packus_epi32 sse2_packus_epi32
     427             : #define srli_epi32 _mm_srli_epi32
     428             : #define mullo_epi16 _mm_mullo_epi16
     429             : #define srli_epi16 _mm_srli_epi16
     430             : #define cmpgt_epi16 _mm_cmpgt_epi16
     431             : #define add_epi16 _mm_add_epi16
     432             : #define sub_epi16 _mm_sub_epi16
     433             : #define packus_epi16 _mm_packus_epi16
     434             : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
     435             : #define hadd_epi16 sse2_hadd_epi16
     436             : #define zeroupper() (void)0
     437             : #endif
     438             : 
     439             : #if defined(__GNUC__) && defined(__AVX2__)
     440             : // Disabling inlining works around a bug with gcc 9.3 (Ubuntu 20.04) in
     441             : // -O2 -mavx2 mode in QuadraticMeanFloatSSE2(),
     442             : // where the registry that contains minus_zero is correctly
     443             : // loaded the first time the function is called (looking at the disassembly,
     444             : // one sees it is loaded much earlier than the function), but gets corrupted
     445             : // (zeroed) in following iterations.
     446             : // It appears the bug is due to the explicit zeroupper() call at the end of
     447             : // the function.
     448             : // The bug is at least solved in gcc 10.2.
     449             : // Inlining doesn't bring much here to performance.
     450             : // This is also needed with gcc 9.3 on QuadraticMeanByteSSE2OrAVX2() in
     451             : // -O3 -mavx2 mode
     452             : #define NOINLINE __attribute__((noinline))
     453             : #else
     454             : #define NOINLINE
     455             : #endif
     456             : 
     457             : template <class T>
     458             : static int NOINLINE
     459        5385 : QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
     460             :                             const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     461             :                             T *CPL_RESTRICT pDstScanline)
     462             : {
     463             :     // Optimized implementation for RMS on Byte by
     464             :     // processing by group of 8 output pixels, so as to use
     465             :     // a single _mm_sqrt_ps() call for 4 output pixels
     466        5385 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     467             : 
     468        5385 :     int iDstPixel = 0;
     469        5385 :     const auto one16 = set1_epi16(1);
     470        5385 :     const auto one32 = set1_epi32(1);
     471        5385 :     const auto zero = setzero();
     472        5385 :     const auto minus32768 = set1_epi16(-32768);
     473             : 
     474      521496 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     475             :     {
     476             :         // Load 2 * DEST_ELTS bytes from each line
     477      516111 :         auto firstLine = loadu_int(pSrcScanlineShifted);
     478     1032220 :         auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
     479             :         // Extend those Bytes as UInt16s
     480      516111 :         auto firstLineLo = unpacklo_epi8(firstLine, zero);
     481      516111 :         auto firstLineHi = unpackhi_epi8(firstLine, zero);
     482      516111 :         auto secondLineLo = unpacklo_epi8(secondLine, zero);
     483      516111 :         auto secondLineHi = unpackhi_epi8(secondLine, zero);
     484             : 
     485             :         // Multiplication of 16 bit values and horizontal
     486             :         // addition of 32 bit results
     487             :         // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
     488      516111 :         firstLineLo = madd_epi16(firstLineLo, firstLineLo);
     489      516111 :         firstLineHi = madd_epi16(firstLineHi, firstLineHi);
     490      516111 :         secondLineLo = madd_epi16(secondLineLo, secondLineLo);
     491      516111 :         secondLineHi = madd_epi16(secondLineHi, secondLineHi);
     492             : 
     493             :         // Vertical addition
     494      516111 :         const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
     495      516111 :         const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
     496             : 
     497             :         const auto sumSquaresPlusOneDiv4Lo =
     498     1032220 :             srli_epi32(add_epi32(sumSquaresLo, one32), 2);
     499             :         const auto sumSquaresPlusOneDiv4Hi =
     500     1032220 :             srli_epi32(add_epi32(sumSquaresHi, one32), 2);
     501             : 
     502             :         // Take square root and truncate/floor to int32
     503             :         const auto rmsLo =
     504     1548330 :             cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
     505             :         const auto rmsHi =
     506     1548330 :             cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
     507             : 
     508             :         // Merge back low and high registers with each RMS value
     509             :         // as a 16 bit value.
     510      516111 :         auto rms = packs_epi32(rmsLo, rmsHi);
     511             : 
     512             :         // Round to upper value if it minimizes the
     513             :         // error |rms^2 - sumSquares/4|
     514             :         // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     515             :         //    rms += 1;
     516             :         // which is equivalent to:
     517             :         // if( rms * (rms + 1) < (sumSquares+1) / 4 )
     518             :         //    rms += 1;
     519             :         // And both left and right parts fit on 16 (unsigned) bits
     520             :         const auto sumSquaresPlusOneDiv4 =
     521      516111 :             packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
     522             :         // cmpgt_epi16 operates on signed int16, but here
     523             :         // we have unsigned values, so shift them by -32768 before
     524     2580560 :         auto mask = cmpgt_epi16(
     525             :             add_epi16(sumSquaresPlusOneDiv4, minus32768),
     526             :             add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
     527             :         // The value of the mask will be -1 when the correction needs to be
     528             :         // applied
     529      516111 :         rms = sub_epi16(rms, mask);
     530             : 
     531             :         // Pack each 16 bit RMS value to 8 bits
     532      516111 :         rms = packus_epi16(rms, rms /* could be anything */);
     533      516111 :         store_lo(&pDstScanline[iDstPixel], rms);
     534      516111 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     535             :     }
     536             :     zeroupper();
     537             : 
     538        5385 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     539        5385 :     return iDstPixel;
     540             : }
     541             : 
     542             : /************************************************************************/
     543             : /*                      AverageByteSSE2OrAVX2()                         */
     544             : /************************************************************************/
     545             : 
     546             : template <class T>
     547             : static int
     548      110996 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
     549             :                       const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     550             :                       T *CPL_RESTRICT pDstScanline)
     551             : {
     552             :     // Optimized implementation for average on Byte by
     553             :     // processing by group of 8 output pixels.
     554             : 
     555      110996 :     const auto zero = setzero();
     556      110996 :     const auto two16 = set1_epi16(2);
     557      110996 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     558             : 
     559      110996 :     int iDstPixel = 0;
     560     4771800 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     561             :     {
     562             :         // Load 2 * DEST_ELTS bytes from each line
     563     4660800 :         const auto firstLine = loadu_int(pSrcScanlineShifted);
     564     9321610 :         const auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
     565             :         // Extend those Bytes as UInt16s
     566     4660800 :         const auto firstLineLo = unpacklo_epi8(firstLine, zero);
     567     4660800 :         const auto firstLineHi = unpackhi_epi8(firstLine, zero);
     568     4660800 :         const auto secondLineLo = unpacklo_epi8(secondLine, zero);
     569     4660800 :         const auto secondLineHi = unpackhi_epi8(secondLine, zero);
     570             : 
     571             :         // Vertical addition
     572     4660800 :         const auto sumLo = add_epi16(firstLineLo, secondLineLo);
     573     4660800 :         const auto sumHi = add_epi16(firstLineHi, secondLineHi);
     574             : 
     575             :         // Horizontal addition of adjacent pairs, and recombine low and high
     576             :         // parts
     577     4660800 :         const auto sum = hadd_epi16(sumLo, sumHi);
     578             : 
     579             :         // average = (sum + 2) / 4
     580     9321610 :         auto average = srli_epi16(add_epi16(sum, two16), 2);
     581             : 
     582             :         // Pack each 16 bit average value to 8 bits
     583     4660800 :         average = packus_epi16(average, average /* could be anything */);
     584     4660800 :         store_lo(&pDstScanline[iDstPixel], average);
     585     4660800 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     586             :     }
     587             :     zeroupper();
     588             : 
     589      110996 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     590      110996 :     return iDstPixel;
     591             : }
     592             : 
     593             : /************************************************************************/
     594             : /*                     QuadraticMeanUInt16SSE2()                        */
     595             : /************************************************************************/
     596             : 
     597             : #ifdef __SSE3__
     598             : #define sse2_hadd_pd _mm_hadd_pd
     599             : #else
     600           8 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
     601             : {
     602             :     auto aLo_bLo =
     603          32 :         _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
     604             :     auto aHi_bHi =
     605          32 :         _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
     606           8 :     return _mm_add_pd(aLo_bLo, aHi_bHi);  // (aLo + aHi, bLo + bHi)
     607             : }
     608             : #endif
     609             : 
     610          40 : inline __m128d SQUARE(__m128d x)
     611             : {
     612          40 :     return _mm_mul_pd(x, x);
     613             : }
     614             : 
     615             : #ifdef __AVX2__
     616             : 
     617             : inline __m256d SQUARE(__m256d x)
     618             : {
     619             :     return _mm256_mul_pd(x, x);
     620             : }
     621             : 
     622             : inline __m256d FIXUP_LANES(__m256d x)
     623             : {
     624             :     return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
     625             : }
     626             : 
     627             : inline __m256 FIXUP_LANES(__m256 x)
     628             : {
     629             :     return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
     630             : }
     631             : 
     632             : #endif
     633             : 
     634             : template <class T>
     635             : static int
     636          10 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
     637             :                         const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     638             :                         T *CPL_RESTRICT pDstScanline)
     639             : {
     640             :     // Optimized implementation for RMS on UInt16 by
     641             :     // processing by group of 4 output pixels.
     642          10 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     643             : 
     644          10 :     int iDstPixel = 0;
     645          10 :     const auto zero = _mm_setzero_si128();
     646             : 
     647             : #ifdef __AVX2__
     648             :     const auto zeroDot25 = _mm256_set1_pd(0.25);
     649             :     const auto zeroDot5 = _mm256_set1_pd(0.5);
     650             : 
     651             :     // The first four 0's could be anything, as we only take the bottom
     652             :     // 128 bits.
     653             :     const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
     654             : #else
     655          10 :     const auto zeroDot25 = _mm_set1_pd(0.25);
     656          10 :     const auto zeroDot5 = _mm_set1_pd(0.5);
     657             : #endif
     658             : 
     659          40 :     for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
     660             :     {
     661             :         // Load 8 UInt16 from each line
     662          30 :         const auto firstLine = _mm_loadu_si128(
     663             :             reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     664             :         const auto secondLine =
     665          30 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     666          30 :                 pSrcScanlineShifted + nChunkXSize));
     667             : 
     668             :         // Detect if all of the source values fit in 14 bits.
     669             :         // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
     670             :         // and we can do a much faster implementation.
     671             :         const auto maskTmp =
     672          60 :             _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
     673             : #if defined(__i386__) || defined(_M_IX86)
     674             :         uint64_t nMaskFitsIn14Bits = 0;
     675             :         _mm_storel_epi64(
     676             :             reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
     677             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     678             : #else
     679          30 :         const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
     680             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     681             : #endif
     682          30 :         if (nMaskFitsIn14Bits == 0)
     683             :         {
     684             :             // Multiplication of 16 bit values and horizontal
     685             :             // addition of 32 bit results
     686             :             const auto firstLineHSumSquare =
     687          26 :                 _mm_madd_epi16(firstLine, firstLine);
     688             :             const auto secondLineHSumSquare =
     689          26 :                 _mm_madd_epi16(secondLine, secondLine);
     690             :             // Vertical addition
     691             :             const auto sumSquares =
     692          26 :                 _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
     693             :             // In theory we should take sqrt(sumSquares * 0.25f)
     694             :             // but given the rounding we do, this is equivalent to
     695             :             // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
     696             :             // sumSquares <= 4 * 16383^2
     697          26 :             const auto one32 = _mm_set1_epi32(1);
     698             :             const auto sumSquaresPlusOneDiv4 =
     699          52 :                 _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
     700             :             // Take square root and truncate/floor to int32
     701          78 :             auto rms = _mm_cvttps_epi32(
     702             :                 _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
     703             : 
     704             :             // Round to upper value if it minimizes the
     705             :             // error |rms^2 - sumSquares/4|
     706             :             // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     707             :             //    rms += 1;
     708             :             // which is equivalent to:
     709             :             // if( rms * rms + rms < (sumSquares+1) / 4 )
     710             :             //    rms += 1;
     711             :             auto mask =
     712          78 :                 _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
     713             :                                 _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
     714          26 :             rms = _mm_sub_epi32(rms, mask);
     715             :             // Pack each 32 bit RMS value to 16 bits
     716          26 :             rms = _mm_packs_epi32(rms, rms /* could be anything */);
     717             :             _mm_storel_epi64(
     718          26 :                 reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
     719          26 :             pSrcScanlineShifted += 8;
     720          26 :             continue;
     721             :         }
     722             : 
     723             :         // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
     724             :         // to 32 bit would result in 4 multiplications instead of 8, but
     725             :         // mullo/mulhi have a worse throughput than mul_pd.
     726             : 
     727             :         // Extend those UInt16s as UInt32s
     728           4 :         const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
     729           4 :         const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
     730           4 :         const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
     731           4 :         const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
     732             : 
     733             : #ifdef __AVX2__
     734             :         // Multiplication of 32 bit values previously converted to 64 bit double
     735             :         const auto firstLineLoDbl = SQUARE(_mm256_cvtepi32_pd(firstLineLo));
     736             :         const auto firstLineHiDbl = SQUARE(_mm256_cvtepi32_pd(firstLineHi));
     737             :         const auto secondLineLoDbl = SQUARE(_mm256_cvtepi32_pd(secondLineLo));
     738             :         const auto secondLineHiDbl = SQUARE(_mm256_cvtepi32_pd(secondLineHi));
     739             : 
     740             :         // Vertical addition of squares
     741             :         const auto sumSquaresLo =
     742             :             _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
     743             :         const auto sumSquaresHi =
     744             :             _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
     745             : 
     746             :         // Horizontal addition of squares
     747             :         const auto sumSquares =
     748             :             FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
     749             : 
     750             :         const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
     751             : 
     752             :         // Take square root and truncate/floor to int32
     753             :         auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
     754             :         const auto rmsDouble = _mm256_cvtepi32_pd(rms);
     755             :         const auto right = _mm256_sub_pd(
     756             :             sumDivWeight, _mm256_add_pd(SQUARE(rmsDouble), rmsDouble));
     757             : 
     758             :         auto mask =
     759             :             _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
     760             :         // Extract 32-bit from each of the 4 64-bit masks
     761             :         // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
     762             :         // _MM_SHUFFLE(2,0,2,0)));
     763             :         mask = _mm256_permutevar8x32_ps(mask, permutation);
     764             :         const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
     765             : 
     766             :         // Apply the correction
     767             :         rms = _mm_sub_epi32(rms, maskI);
     768             : 
     769             :         // Pack each 32 bit RMS value to 16 bits
     770             :         rms = _mm_packus_epi32(rms, rms /* could be anything */);
     771             : #else
     772             :         // Multiplication of 32 bit values previously converted to 64 bit double
     773           4 :         const auto firstLineLoLo = SQUARE(_mm_cvtepi32_pd(firstLineLo));
     774             :         const auto firstLineLoHi =
     775           8 :             SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
     776           4 :         const auto firstLineHiLo = SQUARE(_mm_cvtepi32_pd(firstLineHi));
     777             :         const auto firstLineHiHi =
     778           8 :             SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
     779             : 
     780           4 :         const auto secondLineLoLo = SQUARE(_mm_cvtepi32_pd(secondLineLo));
     781             :         const auto secondLineLoHi =
     782           8 :             SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
     783           4 :         const auto secondLineHiLo = SQUARE(_mm_cvtepi32_pd(secondLineHi));
     784             :         const auto secondLineHiHi =
     785           8 :             SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
     786             : 
     787             :         // Vertical addition of squares
     788           4 :         const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
     789           4 :         const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
     790           4 :         const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
     791           4 :         const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
     792             : 
     793             :         // Horizontal addition of squares
     794           4 :         const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
     795           4 :         const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
     796             : 
     797           4 :         const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
     798           4 :         const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
     799             :         // Take square root and truncate/floor to int32
     800           8 :         const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
     801           8 :         const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
     802             : 
     803             :         // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
     804             :         // if( 0.5 < sumDivWeight - (rms * rms + rms) )
     805             :         //     rms += 1;
     806           4 :         const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
     807           4 :         const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
     808           8 :         const auto rightLo = _mm_sub_pd(
     809             :             sumDivWeightLo, _mm_add_pd(SQUARE(rmsLoDouble), rmsLoDouble));
     810          12 :         const auto rightHi = _mm_sub_pd(
     811             :             sumDivWeightHi, _mm_add_pd(SQUARE(rmsHiDouble), rmsHiDouble));
     812             : 
     813           8 :         const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
     814           4 :         const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
     815             :         // The value of the mask will be -1 when the correction needs to be
     816             :         // applied
     817           8 :         const auto mask = _mm_castps_si128(_mm_shuffle_ps(
     818             :             maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
     819             : 
     820          16 :         auto rms = _mm_castps_si128(
     821             :             _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
     822             :         // Apply the correction
     823           4 :         rms = _mm_sub_epi32(rms, mask);
     824             : 
     825             :         // Pack each 32 bit RMS value to 16 bits
     826           4 :         rms = sse2_packus_epi32(rms, rms /* could be anything */);
     827             : #endif
     828             : 
     829           4 :         _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
     830             :                          rms);
     831           4 :         pSrcScanlineShifted += 8;
     832             :     }
     833             : 
     834             :     zeroupper();
     835             : 
     836          10 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     837          10 :     return iDstPixel;
     838             : }
     839             : 
     840             : /************************************************************************/
     841             : /*                         AverageUInt16SSE2()                          */
     842             : /************************************************************************/
     843             : 
     844             : template <class T>
     845           9 : static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
     846             :                              const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     847             :                              T *CPL_RESTRICT pDstScanline)
     848             : {
     849             :     // Optimized implementation for average on UInt16 by
     850             :     // processing by group of 8 output pixels.
     851             : 
     852           9 :     const auto mask = _mm_set1_epi32(0xFFFF);
     853           9 :     const auto two = _mm_set1_epi32(2);
     854           9 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     855             : 
     856           9 :     int iDstPixel = 0;
     857          13 :     for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
     858             :     {
     859             :         __m128i averageLow;
     860             :         // Load 8 UInt16 from each line
     861             :         {
     862           4 :             const auto firstLine = _mm_loadu_si128(
     863             :                 reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     864             :             const auto secondLine =
     865           4 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     866           4 :                     pSrcScanlineShifted + nChunkXSize));
     867             : 
     868             :             // Horizontal addition and extension to 32 bit
     869          12 :             const auto horizAddFirstLine = _mm_add_epi32(
     870             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     871             :             const auto horizAddSecondLine =
     872          12 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     873             :                               _mm_srli_epi32(secondLine, 16));
     874             : 
     875             :             // Vertical addition and average computation
     876             :             // average = (sum + 2) >> 2
     877           8 :             const auto sum = _mm_add_epi32(
     878             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     879           4 :             averageLow = _mm_srli_epi32(sum, 2);
     880             :         }
     881             :         // Load 8 UInt16 from each line
     882             :         __m128i averageHigh;
     883             :         {
     884           4 :             const auto firstLine = _mm_loadu_si128(
     885           4 :                 reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
     886             :             const auto secondLine =
     887           4 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     888           4 :                     pSrcScanlineShifted + 8 + nChunkXSize));
     889             : 
     890             :             // Horizontal addition and extension to 32 bit
     891          12 :             const auto horizAddFirstLine = _mm_add_epi32(
     892             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     893             :             const auto horizAddSecondLine =
     894          12 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     895             :                               _mm_srli_epi32(secondLine, 16));
     896             : 
     897             :             // Vertical addition and average computation
     898             :             // average = (sum + 2) >> 2
     899           8 :             const auto sum = _mm_add_epi32(
     900             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     901           4 :             averageHigh = _mm_srli_epi32(sum, 2);
     902             :         }
     903             : 
     904             :         // Pack each 32 bit average value to 16 bits
     905           4 :         auto average = sse2_packus_epi32(averageLow, averageHigh);
     906           4 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
     907             :                          average);
     908           4 :         pSrcScanlineShifted += 16;
     909             :     }
     910             : 
     911           9 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     912           9 :     return iDstPixel;
     913             : }
     914             : 
     915             : /************************************************************************/
     916             : /*                      QuadraticMeanFloatSSE2()                        */
     917             : /************************************************************************/
     918             : 
     919             : #ifdef __AVX2__
     920             : #define RMS_FLOAT_ELTS 8
     921             : #define set1_ps _mm256_set1_ps
     922             : #define loadu_ps _mm256_loadu_ps
     923             : #define andnot_ps _mm256_andnot_ps
     924             : #define and_ps _mm256_and_ps
     925             : #define max_ps _mm256_max_ps
     926             : #define shuffle_ps _mm256_shuffle_ps
     927             : #define div_ps _mm256_div_ps
     928             : #define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)
     929             : #define mul_ps _mm256_mul_ps
     930             : #define add_ps _mm256_add_ps
     931             : #define hadd_ps _mm256_hadd_ps
     932             : #define sqrt_ps _mm256_sqrt_ps
     933             : #define or_ps _mm256_or_ps
     934             : #define unpacklo_ps _mm256_unpacklo_ps
     935             : #define unpackhi_ps _mm256_unpackhi_ps
     936             : #define storeu_ps _mm256_storeu_ps
     937             : 
     938             : inline __m256 SQUARE(__m256 x)
     939             : {
     940             :     return _mm256_mul_ps(x, x);
     941             : }
     942             : 
     943             : #else
     944             : 
     945             : #ifdef __SSE3__
     946             : #define sse2_hadd_ps _mm_hadd_ps
     947             : #else
     948             : inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
     949             : {
     950             :     auto aEven_bEven = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
     951             :     auto aOdd_bOdd = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
     952             :     return _mm_add_ps(aEven_bEven, aOdd_bOdd);  // (aEven + aOdd, bEven + bOdd)
     953             : }
     954             : #endif
     955             : 
     956             : #define RMS_FLOAT_ELTS 4
     957             : #define set1_ps _mm_set1_ps
     958             : #define loadu_ps _mm_loadu_ps
     959             : #define andnot_ps _mm_andnot_ps
     960             : #define and_ps _mm_and_ps
     961             : #define max_ps _mm_max_ps
     962             : #define shuffle_ps _mm_shuffle_ps
     963             : #define div_ps _mm_div_ps
     964             : #define cmpeq_ps _mm_cmpeq_ps
     965             : #define mul_ps _mm_mul_ps
     966             : #define add_ps _mm_add_ps
     967             : #define hadd_ps sse2_hadd_ps
     968             : #define sqrt_ps _mm_sqrt_ps
     969             : #define or_ps _mm_or_ps
     970             : #define unpacklo_ps _mm_unpacklo_ps
     971             : #define unpackhi_ps _mm_unpackhi_ps
     972             : #define storeu_ps _mm_storeu_ps
     973             : 
     974         272 : inline __m128 SQUARE(__m128 x)
     975             : {
     976         272 :     return _mm_mul_ps(x, x);
     977             : }
     978             : 
     979          68 : inline __m128 FIXUP_LANES(__m128 x)
     980             : {
     981          68 :     return x;
     982             : }
     983             : 
     984             : #endif
     985             : 
     986             : template <class T>
     987             : static int NOINLINE
     988          34 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
     989             :                        const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     990             :                        T *CPL_RESTRICT pDstScanline)
     991             : {
     992             :     // Optimized implementation for RMS on Float32 by
     993             :     // processing by group of RMS_FLOAT_ELTS output pixels.
     994          34 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     995             : 
     996          34 :     int iDstPixel = 0;
     997          34 :     const auto minus_zero = set1_ps(-0.0f);
     998          34 :     const auto zeroDot25 = set1_ps(0.25f);
     999          34 :     const auto one = set1_ps(1.0f);
    1000          68 :     const auto infv = set1_ps(std::numeric_limits<float>::infinity());
    1001             : 
    1002         102 :     for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
    1003             :          iDstPixel += RMS_FLOAT_ELTS)
    1004             :     {
    1005             :         // Load 2*RMS_FLOAT_ELTS Float32 from each line
    1006             :         auto firstLineLo =
    1007          68 :             loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
    1008          68 :         auto firstLineHi = loadu_ps(reinterpret_cast<float const *>(
    1009          68 :             pSrcScanlineShifted + RMS_FLOAT_ELTS));
    1010          68 :         auto secondLineLo = loadu_ps(
    1011          68 :             reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
    1012          68 :         auto secondLineHi = loadu_ps(reinterpret_cast<float const *>(
    1013          68 :             pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize));
    1014             : 
    1015             :         // Take the absolute value
    1016          68 :         firstLineLo = andnot_ps(minus_zero, firstLineLo);
    1017          68 :         firstLineHi = andnot_ps(minus_zero, firstLineHi);
    1018          68 :         secondLineLo = andnot_ps(minus_zero, secondLineLo);
    1019          68 :         secondLineHi = andnot_ps(minus_zero, secondLineHi);
    1020             : 
    1021             :         auto firstLineEven =
    1022          68 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1023             :         auto firstLineOdd =
    1024          68 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1025             :         auto secondLineEven =
    1026          68 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1027             :         auto secondLineOdd =
    1028          68 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1029             : 
    1030             :         // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
    1031         204 :         const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
    1032             :                                  max_ps(secondLineEven, secondLineEven));
    1033             : 
    1034             :         // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
    1035             :         // This step is important to avoid that the square evaluates to infinity
    1036             :         // for sufficiently big input.
    1037          68 :         auto invMax = div_ps(one, maxV);
    1038             :         // Deal with 0 being the maximum to correct division by zero
    1039             :         // note: comparing to -0 leads to identical results as to comparing with
    1040             :         // 0
    1041         136 :         invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
    1042             : 
    1043          68 :         firstLineEven = mul_ps(firstLineEven, invMax);
    1044          68 :         firstLineOdd = mul_ps(firstLineOdd, invMax);
    1045          68 :         secondLineEven = mul_ps(secondLineEven, invMax);
    1046          68 :         secondLineOdd = mul_ps(secondLineOdd, invMax);
    1047             : 
    1048             :         // Compute squares
    1049          68 :         firstLineEven = SQUARE(firstLineEven);
    1050          68 :         firstLineOdd = SQUARE(firstLineOdd);
    1051          68 :         secondLineEven = SQUARE(secondLineEven);
    1052          68 :         secondLineOdd = SQUARE(secondLineOdd);
    1053             : 
    1054         204 :         const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
    1055             :                                        add_ps(secondLineEven, secondLineOdd));
    1056             : 
    1057         204 :         auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
    1058             : 
    1059             :         // Deal with infinity being the maximum
    1060          68 :         const auto maskIsInf = cmpeq_ps(maxV, infv);
    1061         136 :         rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
    1062             : 
    1063          68 :         rms = FIXUP_LANES(rms);
    1064             : 
    1065             :         // coverity[incompatible_cast]
    1066          68 :         storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]), rms);
    1067          68 :         pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
    1068             :     }
    1069             : 
    1070             :     zeroupper();
    1071             : 
    1072          34 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    1073          34 :     return iDstPixel;
    1074             : }
    1075             : 
    1076             : /************************************************************************/
    1077             : /*                        AverageFloatSSE2()                            */
    1078             : /************************************************************************/
    1079             : 
    1080             : template <class T>
    1081          14 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
    1082             :                             const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1083             :                             T *CPL_RESTRICT pDstScanline)
    1084             : {
    1085             :     // SSE2 fast path for 2x2 averaging of Float32 data: every iteration reads 8 values from each of two source rows (nChunkXSize elements apart) and writes 4 output pixels.
    1086             :     // Returns how many output pixels were written (a multiple of 4) and advances pSrcScanlineShiftedInOut by two source elements per written pixel; pixels beyond the last full group of 4 are not processed here.
    1087          14 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
    1088             : 
    1089          14 :     int iDstPixel = 0;
    1090          14 :     const auto zeroDot25 = _mm_set1_ps(0.25f);
    1091             : 
    1092          32 :     for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
    1093             :     {
    1094             :         // Load 8 Float32 from each line (unaligned loads, 4 values per register)
    1095             :         const auto firstLineLo =
    1096          18 :             _mm_loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
    1097          18 :         const auto firstLineHi = _mm_loadu_ps(
    1098          18 :             reinterpret_cast<float const *>(pSrcScanlineShifted + 4));
    1099          18 :         const auto secondLineLo = _mm_loadu_ps(
    1100          18 :             reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
    1101          18 :         const auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const *>(
    1102          18 :             pSrcScanlineShifted + 4 + nChunkXSize));
    1103             : 
    1104             :         // Vertical addition: element-wise sum of the first and second source lines
    1105          18 :         const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
    1106          18 :         const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
    1107             : 
    1108             :         // Horizontal addition: A takes elements 0 and 2 of sumLo then of sumHi, B takes elements 1 and 3, so A + B adds each pair of adjacent columns
    1109             :         const auto A =
    1110          18 :             _mm_shuffle_ps(sumLo, sumHi, 0 | (2 << 2) | (0 << 4) | (2 << 6));
    1111             :         const auto B =
    1112          18 :             _mm_shuffle_ps(sumLo, sumHi, 1 | (3 << 2) | (1 << 4) | (3 << 6));
    1113          18 :         const auto sum = _mm_add_ps(A, B);
    1114             : 
    1115          18 :         const auto average = _mm_mul_ps(sum, zeroDot25);
    1116             : 
    1117             :         // coverity[incompatible_cast]
    1118          18 :         _mm_storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]),
    1119             :                       average);
    1120          18 :         pSrcScanlineShifted += 8;
    1121             :     }
    1122             : 
    1123          14 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    1124          14 :     return iDstPixel;
    1125             : }
    1126             : 
    1127             : #endif
    1128             : 
    1129             : /************************************************************************/
    1130             : /*                    GDALResampleChunk_AverageOrRMS()                  */
    1131             : /************************************************************************/
    1132             : 
    1133             : template <class T, class Tsum, GDALDataType eWrkDataType>
    1134             : static CPLErr
    1135       10390 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
    1136             :                                  const T *pChunk, void **ppDstBuffer)
    1137             : {
    1138       10390 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    1139       10390 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    1140       10390 :     const double dfSrcXDelta = args.dfSrcXDelta;
    1141       10390 :     const double dfSrcYDelta = args.dfSrcYDelta;
    1142       10390 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    1143       10390 :     const int nChunkXOff = args.nChunkXOff;
    1144       10390 :     const int nChunkYOff = args.nChunkYOff;
    1145       10390 :     const int nChunkXSize = args.nChunkXSize;
    1146       10390 :     const int nChunkYSize = args.nChunkYSize;
    1147       10390 :     const int nDstXOff = args.nDstXOff;
    1148       10390 :     const int nDstXOff2 = args.nDstXOff2;
    1149       10390 :     const int nDstYOff = args.nDstYOff;
    1150       10390 :     const int nDstYOff2 = args.nDstYOff2;
    1151       10390 :     const char *pszResampling = args.pszResampling;
    1152       10390 :     bool bHasNoData = args.bHasNoData;
    1153       10390 :     const double dfNoDataValue = args.dfNoDataValue;
    1154       10390 :     const GDALColorTable *poColorTable = args.poColorTable;
    1155       10390 :     const bool bPropagateNoData = args.bPropagateNoData;
    1156             : 
    1157             :     // AVERAGE_BIT2GRAYSCALE
    1158             :     const bool bBit2Grayscale =
    1159       10390 :         CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
    1160       10397 :     const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
    1161       10395 :     if (bBit2Grayscale)
    1162           9 :         poColorTable = nullptr;
    1163             : 
    1164             :     T tNoDataValue;
    1165       10395 :     if (!bHasNoData)
    1166       10344 :         tNoDataValue = 0;
    1167             :     else
    1168          51 :         tNoDataValue = static_cast<T>(dfNoDataValue);
    1169       10395 :     const T tReplacementVal =
    1170         107 :         bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
    1171          51 :                          args.eOvrDataType, dfNoDataValue))
    1172             :                    : 0;
    1173             : 
    1174       10395 :     int nChunkRightXOff = nChunkXOff + nChunkXSize;
    1175       10395 :     int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    1176       10395 :     int nDstXWidth = nDstXOff2 - nDstXOff;
    1177             : 
    1178             :     /* -------------------------------------------------------------------- */
    1179             :     /*      Allocate buffers.                                               */
    1180             :     /* -------------------------------------------------------------------- */
    1181       10397 :     *ppDstBuffer = static_cast<T *>(
    1182       10395 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
    1183             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
    1184       10397 :     if (*ppDstBuffer == nullptr)
    1185             :     {
    1186           0 :         return CE_Failure;
    1187             :     }
    1188       10397 :     T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
    1189             : 
    1190             :     struct PrecomputedXValue
    1191             :     {
    1192             :         int nLeftXOffShifted;
    1193             :         int nRightXOffShifted;
    1194             :         double dfLeftWeight;
    1195             :         double dfRightWeight;
    1196             :         double dfTotalWeightFullLine;
    1197             :     };
    1198             : 
    1199             :     PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
    1200       10397 :         VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(PrecomputedXValue)));
    1201             : 
    1202       10393 :     if (pasSrcX == nullptr)
    1203             :     {
    1204           0 :         VSIFree(pasSrcX);
    1205           0 :         return CE_Failure;
    1206             :     }
    1207             : 
    1208       10393 :     int nTransparentIdx = -1;
    1209       10393 :     std::vector<GDALColorEntry> colorEntries;
    1210       10391 :     if (poColorTable)
    1211           5 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    1212             : 
    1213             :     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    1214             :     // it as nodata value
    1215       10420 :     if (bHasNoData && dfNoDataValue >= 0.0f &&
    1216          27 :         tNoDataValue < colorEntries.size())
    1217           1 :         colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
    1218             : 
    1219             :     // Or if we have no explicit nodata, but a color table entry that is
    1220             :     // transparent, consider it as the nodata value
    1221       10392 :     else if (!bHasNoData && nTransparentIdx >= 0)
    1222             :     {
    1223           0 :         bHasNoData = true;
    1224           0 :         tNoDataValue = static_cast<T>(nTransparentIdx);
    1225             :     }
    1226             : 
    1227             :     /* ==================================================================== */
    1228             :     /*      Precompute inner loop constants.                                */
    1229             :     /* ==================================================================== */
    1230       10393 :     bool bSrcXSpacingIsTwo = true;
    1231       10393 :     int nLastSrcXOff2 = -1;
    1232      867056 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    1233             :     {
    1234      856663 :         double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    1235             :         // Apply some epsilon to avoid numerical precision issues
    1236      856663 :         int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    1237      856663 :         double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    1238      856663 :         int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    1239             : 
    1240      856663 :         if (nSrcXOff < nChunkXOff)
    1241           0 :             nSrcXOff = nChunkXOff;
    1242      856663 :         if (nSrcXOff2 == nSrcXOff)
    1243           0 :             nSrcXOff2++;
    1244      856663 :         if (nSrcXOff2 > nChunkRightXOff)
    1245           1 :             nSrcXOff2 = nChunkRightXOff;
    1246             : 
    1247      856663 :         pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
    1248      856663 :         pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
    1249      856663 :             nSrcXOff2 - nChunkXOff;
    1250          18 :         pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
    1251      856663 :             (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
    1252      856663 :         pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
    1253      856663 :             1 - (nSrcXOff2 - dfSrcXOff2);
    1254      856663 :         pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
    1255      856663 :             pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
    1256      856663 :         if (nSrcXOff + 1 < nSrcXOff2)
    1257             :         {
    1258      856635 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1259      856635 :                 nSrcXOff2 - nSrcXOff - 2;
    1260      856635 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1261      856635 :                 pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
    1262             :         }
    1263             : 
    1264      856663 :         if (nSrcXOff2 - nSrcXOff != 2 ||
    1265      727183 :             (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
    1266             :         {
    1267      120592 :             bSrcXSpacingIsTwo = false;
    1268             :         }
    1269      856663 :         nLastSrcXOff2 = nSrcXOff2;
    1270             :     }
    1271             : 
    1272             :     /* ==================================================================== */
    1273             :     /*      Loop over destination scanlines.                                */
    1274             :     /* ==================================================================== */
    1275      752820 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    1276             :     {
    1277      742425 :         double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    1278      742425 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    1279      742425 :         if (nSrcYOff < nChunkYOff)
    1280           0 :             nSrcYOff = nChunkYOff;
    1281             : 
    1282      742425 :         double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    1283      742425 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    1284      742425 :         if (nSrcYOff2 == nSrcYOff)
    1285           0 :             ++nSrcYOff2;
    1286      742425 :         if (nSrcYOff2 > nChunkBottomYOff)
    1287           3 :             nSrcYOff2 = nChunkBottomYOff;
    1288             : 
    1289      742425 :         T *const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
    1290             : 
    1291             :         /* --------------------------------------------------------------------
    1292             :          */
    1293             :         /*      Loop over destination pixels */
    1294             :         /* --------------------------------------------------------------------
    1295             :          */
    1296      742425 :         if (poColorTable == nullptr)
    1297             :         {
    1298      742321 :             if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
    1299             :                 pabyChunkNodataMask == nullptr)
    1300             :             {
    1301             :                 if (eWrkDataType == GDT_Byte || eWrkDataType == GDT_UInt16)
    1302             :                 {
    1303             :                     // Optimized case : no nodata, overview by a factor of 2 and
    1304             :                     // regular x and y src spacing.
    1305      116400 :                     const T *pSrcScanlineShifted =
    1306      116400 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1307      116400 :                         static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
    1308      116400 :                             nChunkXSize;
    1309      116400 :                     int iDstPixel = 0;
    1310             : #ifdef USE_SSE2
    1311      116381 :                     if (bQuadraticMean && eWrkDataType == GDT_Byte)
    1312             :                     {
    1313        5385 :                         iDstPixel = QuadraticMeanByteSSE2OrAVX2(
    1314             :                             nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1315             :                             pDstScanline);
    1316             :                     }
    1317      111015 :                     else if (bQuadraticMean /* && eWrkDataType == GDT_UInt16 */)
    1318             :                     {
    1319          10 :                         iDstPixel = QuadraticMeanUInt16SSE2(
    1320             :                             nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1321             :                             pDstScanline);
    1322             :                     }
    1323             :                     else if (/* !bQuadraticMean && */ eWrkDataType == GDT_Byte)
    1324             :                     {
    1325      110996 :                         iDstPixel = AverageByteSSE2OrAVX2(
    1326             :                             nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1327             :                             pDstScanline);
    1328             :                     }
    1329             :                     else /* if( !bQuadraticMean && eWrkDataType == GDT_UInt16 )
    1330             :                           */
    1331             :                     {
    1332           9 :                         iDstPixel = AverageUInt16SSE2(nDstXWidth, nChunkXSize,
    1333             :                                                       pSrcScanlineShifted,
    1334             :                                                       pDstScanline);
    1335             :                     }
    1336             : #endif
    1337      278721 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1338             :                     {
    1339      162321 :                         Tsum nTotal = 0;
    1340             :                         T nVal;
    1341      162321 :                         if (bQuadraticMean)
    1342          44 :                             nTotal =
    1343          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[0]) +
    1344          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[1]) +
    1345          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
    1346          44 :                                 SQUARE<Tsum>(
    1347          44 :                                     pSrcScanlineShifted[1 + nChunkXSize]);
    1348             :                         else
    1349      162277 :                             nTotal = pSrcScanlineShifted[0] +
    1350      162277 :                                      pSrcScanlineShifted[1] +
    1351      162277 :                                      pSrcScanlineShifted[nChunkXSize] +
    1352      162277 :                                      pSrcScanlineShifted[1 + nChunkXSize];
    1353             : 
    1354      162321 :                         constexpr int nTotalWeight = 4;
    1355      162321 :                         if (bQuadraticMean)
    1356          44 :                             nVal = ComputeIntegerRMS_4values<T>(nTotal);
    1357             :                         else
    1358      162277 :                             nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
    1359             :                                                   nTotalWeight);
    1360             : 
    1361             :                         // No need to compare nVal against tNoDataValue as we
    1362             :                         // are in a case where pabyChunkNodataMask == nullptr
    1363             :                         // implies the absence of nodata value.
    1364      162321 :                         pDstScanline[iDstPixel] = nVal;
    1365      162321 :                         pSrcScanlineShifted += 2;
    1366             :                     }
    1367             :                 }
    1368             :                 else
    1369             :                 {
    1370             :                     CPLAssert(eWrkDataType == GDT_Float32 ||
    1371             :                               eWrkDataType == GDT_Float64);
    1372          70 :                     const T *pSrcScanlineShifted =
    1373          70 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1374          70 :                         static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
    1375          70 :                             nChunkXSize;
    1376          70 :                     int iDstPixel = 0;
    1377             : #ifdef USE_SSE2
    1378             :                     if (eWrkDataType == GDT_Float32)
    1379             :                     {
    1380          48 :                         if (bQuadraticMean)
    1381             :                         {
    1382          34 :                             iDstPixel = QuadraticMeanFloatSSE2(
    1383             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1384             :                                 pDstScanline);
    1385             :                         }
    1386             :                         else
    1387             :                         {
    1388          14 :                             iDstPixel = AverageFloatSSE2(
    1389             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1390             :                                 pDstScanline);
    1391             :                         }
    1392             :                     }
    1393             : #endif
    1394             : 
    1395         268 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1396             :                     {
    1397             :                         T nVal;
    1398         198 :                         if (bQuadraticMean)
    1399             :                         {
    1400             :                             // Cast to double to avoid overflows
    1401             :                             // (using std::hypot() is much slower)
    1402         100 :                             nVal = static_cast<T>(std::sqrt(
    1403             :                                 0.25 *
    1404         100 :                                 (SQUARE<double>(pSrcScanlineShifted[0]) +
    1405         100 :                                  SQUARE<double>(pSrcScanlineShifted[1]) +
    1406         100 :                                  SQUARE<double>(
    1407         200 :                                      pSrcScanlineShifted[nChunkXSize]) +
    1408         100 :                                  SQUARE<double>(
    1409         100 :                                      pSrcScanlineShifted[1 + nChunkXSize]))));
    1410             :                         }
    1411             :                         else
    1412             :                         {
    1413          98 :                             nVal = static_cast<T>(
    1414          98 :                                 0.25f * (pSrcScanlineShifted[0] +
    1415          98 :                                          pSrcScanlineShifted[1] +
    1416          98 :                                          pSrcScanlineShifted[nChunkXSize] +
    1417          98 :                                          pSrcScanlineShifted[1 + nChunkXSize]));
    1418             :                         }
    1419             : 
    1420             :                         // No need to compare nVal against tNoDataValue as we
    1421             :                         // are in a case where pabyChunkNodataMask == nullptr
    1422             :                         // implies the absence of nodata value.
    1423         198 :                         pDstScanline[iDstPixel] = nVal;
    1424         198 :                         pSrcScanlineShifted += 2;
    1425             :                     }
    1426      116470 :                 }
    1427             :             }
    1428             :             else
    1429             :             {
    1430          24 :                 const double dfBottomWeight =
    1431      625851 :                     (nSrcYOff + 1 == nSrcYOff2) ? 1.0
    1432      625827 :                                                 : 1.0 - (dfSrcYOff - nSrcYOff);
    1433      625851 :                 const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
    1434      625851 :                 nSrcYOff -= nChunkYOff;
    1435      625851 :                 nSrcYOff2 -= nChunkYOff;
    1436             : 
    1437      625851 :                 double dfTotalWeightFullColumn = dfBottomWeight;
    1438      625851 :                 if (nSrcYOff + 1 < nSrcYOff2)
    1439             :                 {
    1440      625820 :                     dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
    1441      625820 :                     dfTotalWeightFullColumn += dfTopWeight;
    1442             :                 }
    1443             : 
    1444    18585256 :                 for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1445             :                 {
    1446    17959281 :                     const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1447    17959281 :                     const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1448             : 
    1449    17959281 :                     double dfTotal = 0;
    1450    17959281 :                     double dfTotalWeight = 0;
    1451    17959281 :                     if (pabyChunkNodataMask == nullptr)
    1452             :                     {
    1453     1746435 :                         auto pChunkShifted =
    1454         115 :                             pChunk +
    1455     1746435 :                             static_cast<GPtrDiff_t>(nSrcYOff) * nChunkXSize;
    1456     1746435 :                         int nCounterY = nSrcYOff2 - nSrcYOff - 1;
    1457     1746435 :                         double dfWeightY = dfBottomWeight;
    1458     3493427 :                         while (true)
    1459             :                         {
    1460             :                             double dfTotalLine;
    1461     5239852 :                             if (bQuadraticMean)
    1462             :                             {
    1463             :                                 // Left pixel
    1464             :                                 {
    1465         104 :                                     const T val = pChunkShifted[nSrcXOff];
    1466         104 :                                     dfTotalLine =
    1467         104 :                                         SQUARE<double>(val) *
    1468         104 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1469             :                                 }
    1470             : 
    1471         104 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1472             :                                 {
    1473             :                                     // Middle pixels
    1474         104 :                                     for (int iX = nSrcXOff + 1;
    1475         424 :                                          iX + 1 < nSrcXOff2; ++iX)
    1476             :                                     {
    1477         320 :                                         const T val = pChunkShifted[iX];
    1478         320 :                                         dfTotalLine += SQUARE<double>(val);
    1479             :                                     }
    1480             : 
    1481             :                                     // Right pixel
    1482             :                                     {
    1483         104 :                                         const T val =
    1484         104 :                                             pChunkShifted[nSrcXOff2 - 1];
    1485         104 :                                         dfTotalLine +=
    1486         104 :                                             SQUARE<double>(val) *
    1487         104 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1488             :                                     }
    1489             :                                 }
    1490             :                             }
    1491             :                             else
    1492             :                             {
    1493             :                                 // Left pixel
    1494             :                                 {
    1495     5239756 :                                     const T val = pChunkShifted[nSrcXOff];
    1496     5239756 :                                     dfTotalLine =
    1497     5239756 :                                         val * pasSrcX[iDstPixel].dfLeftWeight;
    1498             :                                 }
    1499             : 
    1500     5239756 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1501             :                                 {
    1502             :                                     // Middle pixels
    1503     4239330 :                                     for (int iX = nSrcXOff + 1;
    1504    64183126 :                                          iX + 1 < nSrcXOff2; ++iX)
    1505             :                                     {
    1506    59943836 :                                         const T val = pChunkShifted[iX];
    1507    59943836 :                                         dfTotalLine += val;
    1508             :                                     }
    1509             : 
    1510             :                                     // Right pixel
    1511             :                                     {
    1512     4239330 :                                         const T val =
    1513     4239330 :                                             pChunkShifted[nSrcXOff2 - 1];
    1514     4239330 :                                         dfTotalLine +=
    1515     4239330 :                                             val *
    1516     4239330 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1517             :                                     }
    1518             :                                 }
    1519             :                             }
    1520             : 
    1521     5239852 :                             dfTotal += dfTotalLine * dfWeightY;
    1522     5239852 :                             --nCounterY;
    1523     5239852 :                             if (nCounterY < 0)
    1524     1746435 :                                 break;
    1525     3493427 :                             pChunkShifted += nChunkXSize;
    1526     3493427 :                             dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
    1527             :                         }
    1528             : 
    1529     1746435 :                         dfTotalWeight =
    1530     1746435 :                             pasSrcX[iDstPixel].dfTotalWeightFullLine *
    1531             :                             dfTotalWeightFullColumn;
    1532             :                     }
    1533             :                     else
    1534             :                     {
    1535    16212866 :                         GPtrDiff_t nCount = 0;
    1536    71187098 :                         for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1537             :                         {
    1538    54974032 :                             const auto pChunkShifted =
    1539         132 :                                 pChunk +
    1540    54974032 :                                 static_cast<GPtrDiff_t>(iY) * nChunkXSize;
    1541             : 
    1542    54974032 :                             double dfTotalLine = 0;
    1543    54974032 :                             double dfTotalWeightLine = 0;
    1544             :                             // Left pixel
    1545             :                             {
    1546    54974032 :                                 const int iX = nSrcXOff;
    1547    54974032 :                                 const T val = pChunkShifted[iX];
    1548    54974032 :                                 if (pabyChunkNodataMask[iX + iY * nChunkXSize])
    1549             :                                 {
    1550    23417381 :                                     nCount++;
    1551    23417381 :                                     const double dfWeightX =
    1552    23417381 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1553    23417381 :                                     dfTotalWeightLine = dfWeightX;
    1554    23417381 :                                     if (bQuadraticMean)
    1555          60 :                                         dfTotalLine =
    1556          60 :                                             SQUARE<double>(val) * dfWeightX;
    1557             :                                     else
    1558    23417381 :                                         dfTotalLine = val * dfWeightX;
    1559             :                                 }
    1560             :                             }
    1561             : 
    1562    54974032 :                             if (nSrcXOff + 1 < nSrcXOff2)
    1563             :                             {
    1564             :                                 // Middle pixels
    1565   145163132 :                                 for (int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2;
    1566             :                                      ++iX)
    1567             :                                 {
    1568    90193400 :                                     const T val = pChunkShifted[iX];
    1569    90193400 :                                     if (pabyChunkNodataMask[iX +
    1570    90193400 :                                                             iY * nChunkXSize])
    1571             :                                     {
    1572    39727500 :                                         nCount++;
    1573    39727500 :                                         dfTotalWeightLine += 1;
    1574    39727500 :                                         if (bQuadraticMean)
    1575           0 :                                             dfTotalLine += SQUARE<double>(val);
    1576             :                                         else
    1577    39727500 :                                             dfTotalLine += val;
    1578             :                                     }
    1579             :                                 }
    1580             : 
    1581             :                                 // Right pixel
    1582             :                                 {
    1583    54969432 :                                     const int iX = nSrcXOff2 - 1;
    1584    54969432 :                                     const T val = pChunkShifted[iX];
    1585    54969432 :                                     if (pabyChunkNodataMask[iX +
    1586    54969432 :                                                             iY * nChunkXSize])
    1587             :                                     {
    1588    23418047 :                                         nCount++;
    1589    23418047 :                                         const double dfWeightX =
    1590    23418047 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1591    23418047 :                                         dfTotalWeightLine += dfWeightX;
    1592    23418047 :                                         if (bQuadraticMean)
    1593         531 :                                             dfTotalLine +=
    1594          61 :                                                 SQUARE<double>(val) * dfWeightX;
    1595             :                                         else
    1596    23417946 :                                             dfTotalLine += val * dfWeightX;
    1597             :                                     }
    1598             :                                 }
    1599             :                             }
    1600             : 
    1601    93736998 :                             const double dfWeightY =
    1602             :                                 (iY == nSrcYOff)        ? dfBottomWeight
    1603    38762766 :                                 : (iY + 1 == nSrcYOff2) ? dfTopWeight
    1604             :                                                         : 1.0;
    1605    54974232 :                             dfTotal += dfTotalLine * dfWeightY;
    1606    54974232 :                             dfTotalWeight += dfTotalWeightLine * dfWeightY;
    1607             :                         }
    1608             : 
    1609    16213066 :                         if (nCount == 0 ||
    1610           8 :                             (bPropagateNoData &&
    1611             :                              nCount <
    1612           8 :                                  static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
    1613           8 :                                      (nSrcXOff2 - nSrcXOff)))
    1614             :                         {
    1615     9461432 :                             pDstScanline[iDstPixel] = tNoDataValue;
    1616     9461432 :                             continue;
    1617             :                         }
    1618             :                     }
    1619             :                     if (eWrkDataType == GDT_Byte)
    1620             :                     {
    1621             :                         T nVal;
    1622     8497910 :                         if (bQuadraticMean)
    1623          38 :                             nVal = ComputeIntegerRMS<T, int>(dfTotal,
    1624             :                                                              dfTotalWeight);
    1625             :                         else
    1626     8497870 :                             nVal =
    1627     8497870 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1628     8497780 :                         if (bHasNoData && nVal == tNoDataValue)
    1629           0 :                             nVal = tReplacementVal;
    1630     8497780 :                         pDstScanline[iDstPixel] = nVal;
    1631             :                     }
    1632             :                     else if (eWrkDataType == GDT_UInt16)
    1633             :                     {
    1634             :                         T nVal;
    1635           8 :                         if (bQuadraticMean)
    1636           4 :                             nVal = ComputeIntegerRMS<T, uint64_t>(
    1637             :                                 dfTotal, dfTotalWeight);
    1638             :                         else
    1639           4 :                             nVal =
    1640           4 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1641           8 :                         if (bHasNoData && nVal == tNoDataValue)
    1642           0 :                             nVal = tReplacementVal;
    1643           8 :                         pDstScanline[iDstPixel] = nVal;
    1644             :                     }
    1645             :                     else
    1646             :                     {
    1647             :                         T nVal;
    1648         151 :                         if (bQuadraticMean)
    1649          20 :                             nVal =
    1650          25 :                                 static_cast<T>(sqrt(dfTotal / dfTotalWeight));
    1651             :                         else
    1652         126 :                             nVal = static_cast<T>(dfTotal / dfTotalWeight);
    1653         151 :                         if (bHasNoData && nVal == tNoDataValue)
    1654           2 :                             nVal = tReplacementVal;
    1655         151 :                         pDstScanline[iDstPixel] = nVal;
    1656             :                     }
    1657             :                 }
    1658             :             }
    1659             :         }
    1660             :         else
    1661             :         {
    1662         104 :             nSrcYOff -= nChunkYOff;
    1663         104 :             nSrcYOff2 -= nChunkYOff;
    1664             : 
    1665        6505 :             for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1666             :             {
    1667        6475 :                 const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1668        6475 :                 const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1669             : 
    1670        6475 :                 GPtrDiff_t nTotalR = 0;
    1671        6475 :                 GPtrDiff_t nTotalG = 0;
    1672        6475 :                 GPtrDiff_t nTotalB = 0;
    1673        6475 :                 GPtrDiff_t nCount = 0;
    1674             : 
    1675       19425 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1676             :                 {
    1677       38850 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    1678             :                     {
    1679       25900 :                         const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *
    1680       25900 :                                                       nChunkXSize];
    1681             :                         // cppcheck-suppress unsignedLessThanZero
    1682       25900 :                         if (val < 0 || val >= colorEntries.size())
    1683           0 :                             continue;
    1684       25900 :                         size_t idx = static_cast<size_t>(val);
    1685       25900 :                         const auto &entry = colorEntries[idx];
    1686       25900 :                         if (entry.c4)
    1687             :                         {
    1688       14128 :                             if (bQuadraticMean)
    1689             :                             {
    1690         800 :                                 nTotalR += SQUARE<int>(entry.c1);
    1691         800 :                                 nTotalG += SQUARE<int>(entry.c2);
    1692         800 :                                 nTotalB += SQUARE<int>(entry.c3);
    1693         800 :                                 ++nCount;
    1694             :                             }
    1695             :                             else
    1696             :                             {
    1697       13328 :                                 nTotalR += entry.c1;
    1698       13328 :                                 nTotalG += entry.c2;
    1699       13328 :                                 nTotalB += entry.c3;
    1700       13328 :                                 ++nCount;
    1701             :                             }
    1702             :                         }
    1703             :                     }
    1704             :                 }
    1705             : 
    1706        6475 :                 if (nCount == 0 ||
    1707           0 :                     (bPropagateNoData &&
    1708           0 :                      nCount < static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
    1709           0 :                                   (nSrcXOff2 - nSrcXOff)))
    1710             :                 {
    1711        2838 :                     pDstScanline[iDstPixel] = tNoDataValue;
    1712             :                 }
    1713             :                 else
    1714             :                 {
    1715             :                     GDALColorEntry color;
    1716        3637 :                     if (bQuadraticMean)
    1717             :                     {
    1718         200 :                         color.c1 =
    1719         200 :                             static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
    1720         200 :                         color.c2 =
    1721         200 :                             static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
    1722         200 :                         color.c3 =
    1723         200 :                             static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
    1724             :                     }
    1725             :                     else
    1726             :                     {
    1727        3437 :                         color.c1 =
    1728        3437 :                             static_cast<short>((nTotalR + nCount / 2) / nCount);
    1729        3437 :                         color.c2 =
    1730        3437 :                             static_cast<short>((nTotalG + nCount / 2) / nCount);
    1731        3437 :                         color.c3 =
    1732        3437 :                             static_cast<short>((nTotalB + nCount / 2) / nCount);
    1733             :                     }
    1734        3563 :                     pDstScanline[iDstPixel] =
    1735        3637 :                         static_cast<T>(BestColorEntry(colorEntries, color));
    1736             :                 }
    1737             :             }
    1738             :         }
    1739             :     }
    1740             : 
    1741       10395 :     CPLFree(pasSrcX);
    1742             : 
    1743       10394 :     return CE_None;
    1744             : }
    1745             : 
    1746             : static CPLErr
    1747       10391 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
    1748             :                                const void *pChunk, void **ppDstBuffer,
    1749             :                                GDALDataType *peDstBufferDataType)
    1750             : {
    1751       10391 :     *peDstBufferDataType = args.eWrkDataType;
    1752       10391 :     switch (args.eWrkDataType)
    1753             :     {
    1754       10326 :         case GDT_Byte:
    1755             :         {
    1756       10326 :             return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
    1757       10324 :                 args, static_cast<const GByte *>(pChunk), ppDstBuffer);
    1758             :         }
    1759             : 
    1760           9 :         case GDT_UInt16:
    1761             :         {
    1762           9 :             if (EQUAL(args.pszResampling, "RMS"))
    1763             :             {
    1764             :                 // Use double as accumulation type, because UInt32 could overflow
    1765             :                 return GDALResampleChunk_AverageOrRMS_T<GUInt16, double,
    1766           5 :                                                         GDT_UInt16>(
    1767           5 :                     args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
    1768             :             }
    1769             :             else
    1770             :             {
    1771             :                 return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
    1772           4 :                                                         GDT_UInt16>(
    1773           4 :                     args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
    1774             :             }
    1775             :         }
    1776             : 
    1777          39 :         case GDT_Float32:
    1778             :         {
    1779          39 :             return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
    1780          39 :                 args, static_cast<const float *>(pChunk), ppDstBuffer);
    1781             :         }
    1782             : 
    1783          17 :         case GDT_Float64:
    1784             :         {
    1785             :             return GDALResampleChunk_AverageOrRMS_T<double, double,
    1786          17 :                                                     GDT_Float64>(
    1787          17 :                 args, static_cast<const double *>(pChunk), ppDstBuffer);
    1788             :         }
    1789             : 
    1790           0 :         default:
    1791           0 :             break;
    1792             :     }
    1793             : 
    1794           0 :     CPLAssert(false);
    1795             :     return CE_Failure;
    1796             : }
    1797             : 
    1798             : /************************************************************************/
    1799             : /*                     GDALResampleChunk_Gauss()                        */
    1800             : /************************************************************************/
    1801             : 
    1802          86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
    1803             :                                       const void *pChunk, void **ppDstBuffer,
    1804             :                                       GDALDataType *peDstBufferDataType)
    1805             : 
    1806             : {
    1807          86 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    1808          86 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    1809          86 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    1810          86 :     const int nChunkXOff = args.nChunkXOff;
    1811          86 :     const int nChunkXSize = args.nChunkXSize;
    1812          86 :     const int nChunkYOff = args.nChunkYOff;
    1813          86 :     const int nChunkYSize = args.nChunkYSize;
    1814          86 :     const int nDstXOff = args.nDstXOff;
    1815          86 :     const int nDstXOff2 = args.nDstXOff2;
    1816          86 :     const int nDstYOff = args.nDstYOff;
    1817          86 :     const int nDstYOff2 = args.nDstYOff2;
    1818          86 :     const bool bHasNoData = args.bHasNoData;
    1819          86 :     double dfNoDataValue = args.dfNoDataValue;
    1820          86 :     const GDALColorTable *poColorTable = args.poColorTable;
    1821             : 
    1822          86 :     const double *const padfChunk = static_cast<const double *>(pChunk);
    1823             : 
    1824          86 :     *ppDstBuffer =
    1825          86 :         VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
    1826             :                             GDALGetDataTypeSizeBytes(GDT_Float64));
    1827          86 :     if (*ppDstBuffer == nullptr)
    1828             :     {
    1829           0 :         return CE_Failure;
    1830             :     }
    1831          86 :     *peDstBufferDataType = GDT_Float64;
    1832          86 :     double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
    1833             : 
    1834             :     /* -------------------------------------------------------------------- */
    1835             :     /*      Create the filter kernel and allocate scanline buffer.          */
    1836             :     /* -------------------------------------------------------------------- */
    1837          86 :     int nGaussMatrixDim = 3;
    1838             :     const int *panGaussMatrix;
    1839          86 :     constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
    1840          86 :     constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
    1841             :                                         4,  6, 24, 36, 24, 6, 4,  16, 24,
    1842             :                                         16, 4, 1,  4,  6,  4, 1};
    1843          86 :     constexpr int anGaussMatrix7x7[] = {
    1844             :         1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
    1845             :         6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
    1846             :         120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
    1847             :         90,  36, 6,  1,   6,   15,  20,  15, 6,  1};
    1848             : 
    1849          86 :     const int nOXSize = args.nOvrXSize;
    1850          86 :     const int nOYSize = args.nOvrYSize;
    1851          86 :     const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    1852             : 
    1853             :     // matrix for gauss filter
    1854          86 :     if (nResYFactor <= 2)
    1855             :     {
    1856          85 :         panGaussMatrix = anGaussMatrix3x3;
    1857          85 :         nGaussMatrixDim = 3;
    1858             :     }
    1859           1 :     else if (nResYFactor <= 4)
    1860             :     {
    1861           0 :         panGaussMatrix = anGaussMatrix5x5;
    1862           0 :         nGaussMatrixDim = 5;
    1863             :     }
    1864             :     else
    1865             :     {
    1866           1 :         panGaussMatrix = anGaussMatrix7x7;
    1867           1 :         nGaussMatrixDim = 7;
    1868             :     }
    1869             : 
    1870             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    1871             :     int *panGaussMatrixDup = static_cast<int *>(
    1872             :         CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
    1873             :     memcpy(panGaussMatrixDup, panGaussMatrix,
    1874             :            sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
    1875             :     panGaussMatrix = panGaussMatrixDup;
    1876             : #endif
    1877             : 
    1878          86 :     if (!bHasNoData)
    1879          79 :         dfNoDataValue = 0.0;
    1880             : 
    1881          86 :     std::vector<GDALColorEntry> colorEntries;
    1882          86 :     int nTransparentIdx = -1;
    1883          86 :     if (poColorTable)
    1884           2 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    1885             : 
    1886             :     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    1887             :     // it as nodata value.
    1888          92 :     if (bHasNoData && dfNoDataValue >= 0.0f &&
    1889           6 :         dfNoDataValue < colorEntries.size())
    1890           0 :         colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
    1891             : 
    1892             :     // Or if we have no explicit nodata, but a color table entry that is
    1893             :     // transparent, consider it as the nodata value.
    1894          86 :     else if (!bHasNoData && nTransparentIdx >= 0)
    1895             :     {
    1896           0 :         dfNoDataValue = nTransparentIdx;
    1897             :     }
    1898             : 
    1899          86 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    1900          86 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    1901          86 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
    1902             : 
    1903             :     /* ==================================================================== */
    1904             :     /*      Loop over destination scanlines.                                */
    1905             :     /* ==================================================================== */
    1906       16488 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    1907             :     {
    1908       16402 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    1909       16402 :         int nSrcYOff2 =
    1910       16402 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
    1911             : 
    1912       16402 :         if (nSrcYOff < nChunkYOff)
    1913             :         {
    1914           0 :             nSrcYOff = nChunkYOff;
    1915           0 :             nSrcYOff2++;
    1916             :         }
    1917             : 
    1918       16402 :         const int iSizeY = nSrcYOff2 - nSrcYOff;
    1919       16402 :         nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
    1920       16402 :         nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
    1921             : 
    1922       16402 :         if (nSrcYOff2 > nChunkBottomYOff ||
    1923       16359 :             (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
    1924             :         {
    1925          44 :             nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
    1926             :         }
    1927             : 
    1928       16402 :         int nYShiftGaussMatrix = 0;
    1929       16402 :         if (nSrcYOff < nChunkYOff)
    1930             :         {
    1931           0 :             nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
    1932           0 :             nSrcYOff = nChunkYOff;
    1933             :         }
    1934             : 
    1935       16402 :         const double *const padfSrcScanline =
    1936       16402 :             padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    1937       16402 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    1938       16402 :         if (pabyChunkNodataMask != nullptr)
    1939         152 :             pabySrcScanlineNodataMask =
    1940         152 :                 pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    1941             : 
    1942             :         /* --------------------------------------------------------------------
    1943             :          */
    1944             :         /*      Loop over destination pixels */
    1945             :         /* --------------------------------------------------------------------
    1946             :          */
    1947       16402 :         double *const padfDstScanline =
    1948       16402 :             padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
    1949     4149980 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    1950             :         {
    1951     4133580 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    1952     4133580 :             int nSrcXOff2 =
    1953     4133580 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
    1954             : 
    1955     4133580 :             if (nSrcXOff < nChunkXOff)
    1956             :             {
    1957           0 :                 nSrcXOff = nChunkXOff;
    1958           0 :                 nSrcXOff2++;
    1959             :             }
    1960             : 
    1961     4133580 :             const int iSizeX = nSrcXOff2 - nSrcXOff;
    1962     4133580 :             nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
    1963     4133580 :             nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
    1964             : 
    1965     4133580 :             if (nSrcXOff2 > nChunkRightXOff ||
    1966     4127930 :                 (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
    1967             :             {
    1968        5650 :                 nSrcXOff2 =
    1969        5650 :                     std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
    1970             :             }
    1971             : 
    1972     4133580 :             int nXShiftGaussMatrix = 0;
    1973     4133580 :             if (nSrcXOff < nChunkXOff)
    1974             :             {
    1975           0 :                 nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
    1976           0 :                 nSrcXOff = nChunkXOff;
    1977             :             }
    1978             : 
    1979     4133580 :             if (poColorTable == nullptr)
    1980             :             {
    1981     4133380 :                 double dfTotal = 0.0;
    1982     4133380 :                 GInt64 nCount = 0;
    1983     4133380 :                 const int *panLineWeight =
    1984     4133380 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    1985             :                     nXShiftGaussMatrix;
    1986             : 
    1987    16527900 :                 for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
    1988    12394500 :                      ++iY, ++j, panLineWeight += nGaussMatrixDim)
    1989             :                 {
    1990    49561300 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    1991             :                     {
    1992    37166800 :                         const double val =
    1993    37166800 :                             padfSrcScanline[iX - nChunkXOff +
    1994    37166800 :                                             static_cast<GPtrDiff_t>(iY -
    1995    37166800 :                                                                     nSrcYOff) *
    1996    37166800 :                                                 nChunkXSize];
    1997    37166800 :                         if (pabySrcScanlineNodataMask == nullptr ||
    1998       32872 :                             pabySrcScanlineNodataMask[iX - nChunkXOff +
    1999       32872 :                                                       static_cast<GPtrDiff_t>(
    2000       32872 :                                                           iY - nSrcYOff) *
    2001       32872 :                                                           nChunkXSize])
    2002             :                         {
    2003    37146100 :                             const int nWeight = panLineWeight[i];
    2004    37146100 :                             dfTotal += val * nWeight;
    2005    37146100 :                             nCount += nWeight;
    2006             :                         }
    2007             :                     }
    2008             :                 }
    2009             : 
    2010     4133380 :                 if (nCount == 0)
    2011             :                 {
    2012        2217 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2013             :                 }
    2014             :                 else
    2015             :                 {
    2016     4131160 :                     padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
    2017             :                 }
    2018             :             }
    2019             :             else
    2020             :             {
    2021         200 :                 GInt64 nTotalR = 0;
    2022         200 :                 GInt64 nTotalG = 0;
    2023         200 :                 GInt64 nTotalB = 0;
    2024         200 :                 GInt64 nTotalWeight = 0;
    2025         200 :                 const int *panLineWeight =
    2026         200 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    2027             :                     nXShiftGaussMatrix;
    2028             : 
    2029         780 :                 for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
    2030         580 :                      ++iY, ++j, panLineWeight += nGaussMatrixDim)
    2031             :                 {
    2032        2262 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    2033             :                     {
    2034        1682 :                         const double val =
    2035        1682 :                             padfSrcScanline[iX - nChunkXOff +
    2036        1682 :                                             static_cast<GPtrDiff_t>(iY -
    2037        1682 :                                                                     nSrcYOff) *
    2038        1682 :                                                 nChunkXSize];
    2039        1682 :                         if (val < 0 || val >= colorEntries.size())
    2040           0 :                             continue;
    2041             : 
    2042        1682 :                         size_t idx = static_cast<size_t>(val);
    2043        1682 :                         if (colorEntries[idx].c4)
    2044             :                         {
    2045        1682 :                             const int nWeight = panLineWeight[i];
    2046        1682 :                             nTotalR +=
    2047        1682 :                                 static_cast<GInt64>(colorEntries[idx].c1) *
    2048        1682 :                                 nWeight;
    2049        1682 :                             nTotalG +=
    2050        1682 :                                 static_cast<GInt64>(colorEntries[idx].c2) *
    2051        1682 :                                 nWeight;
    2052        1682 :                             nTotalB +=
    2053        1682 :                                 static_cast<GInt64>(colorEntries[idx].c3) *
    2054        1682 :                                 nWeight;
    2055        1682 :                             nTotalWeight += nWeight;
    2056             :                         }
    2057             :                     }
    2058             :                 }
    2059             : 
    2060         200 :                 if (nTotalWeight == 0)
    2061             :                 {
    2062           0 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2063             :                 }
    2064             :                 else
    2065             :                 {
    2066             :                     GDALColorEntry color;
    2067             : 
    2068         200 :                     color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
    2069             :                                                   nTotalWeight);
    2070         200 :                     color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
    2071             :                                                   nTotalWeight);
    2072         200 :                     color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
    2073             :                                                   nTotalWeight);
    2074         200 :                     padfDstScanline[iDstPixel - nDstXOff] =
    2075         200 :                         BestColorEntry(colorEntries, color);
    2076             :                 }
    2077             :             }
    2078             :         }
    2079             :     }
    2080             : 
    2081             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    2082             :     CPLFree(panGaussMatrixDup);
    2083             : #endif
    2084             : 
    2085          86 :     return CE_None;
    2086             : }
    2087             : 
    2088             : /************************************************************************/
    2089             : /*                      GDALResampleChunk_Mode()                        */
    2090             : /************************************************************************/
    2091             : 
    2092        4398 : template <class T> static inline bool IsSame(T a, T b)
    2093             : {
    2094        4398 :     return a == b;
    2095             : }
    2096             : 
    2097        4854 : template <> bool IsSame<float>(float a, float b)
    2098             : {
    2099        4854 :     return a == b || (std::isnan(a) && std::isnan(b));
    2100             : }
    2101             : 
    2102         504 : template <> bool IsSame<double>(double a, double b)
    2103             : {
    2104         504 :     return a == b || (std::isnan(a) && std::isnan(b));
    2105             : }
    2106             : 
    2107             : template <>
    2108         480 : bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
    2109             : {
    2110         960 :     return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
    2111         960 :                       std::isnan(b.real()) && std::isnan(b.imag()));
    2112             : }
    2113             : 
    2114             : template <>
    2115         480 : bool IsSame<std::complex<double>>(std::complex<double> a,
    2116             :                                   std::complex<double> b)
    2117             : {
    2118         960 :     return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
    2119         960 :                       std::isnan(b.real()) && std::isnan(b.imag()));
    2120             : }
    2121             : 
    2122             : template <class T>
    2123         136 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
    2124             :                                       const T *pChunk, T *const pDstBuffer)
    2125             : 
    2126             : {
    2127         136 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    2128         136 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    2129         136 :     const double dfSrcXDelta = args.dfSrcXDelta;
    2130         136 :     const double dfSrcYDelta = args.dfSrcYDelta;
    2131         136 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    2132         136 :     const int nChunkXOff = args.nChunkXOff;
    2133         136 :     const int nChunkXSize = args.nChunkXSize;
    2134         136 :     const int nChunkYOff = args.nChunkYOff;
    2135         136 :     const int nChunkYSize = args.nChunkYSize;
    2136         136 :     const int nDstXOff = args.nDstXOff;
    2137         136 :     const int nDstXOff2 = args.nDstXOff2;
    2138         136 :     const int nDstYOff = args.nDstYOff;
    2139         136 :     const int nDstYOff2 = args.nDstYOff2;
    2140         136 :     const bool bHasNoData = args.bHasNoData;
    2141         136 :     const GDALColorTable *poColorTable = args.poColorTable;
    2142         136 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    2143             : 
    2144           8 :     T tNoDataValue;
    2145             :     if constexpr (std::is_same<T, std::complex<float>>::value ||
    2146             :                   std::is_same<T, std::complex<double>>::value)
    2147             :     {
    2148             :         using BaseT = typename T::value_type;
    2149           8 :         tNoDataValue =
    2150             :             std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
    2151             :                                 std::numeric_limits<BaseT>::quiet_NaN());
    2152             :     }
    2153         128 :     else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
    2154         127 :         tNoDataValue = 0;
    2155             :     else
    2156           1 :         tNoDataValue = static_cast<T>(args.dfNoDataValue);
    2157             : 
    2158         136 :     size_t nMaxNumPx = 0;
    2159         136 :     T *paVals = nullptr;
    2160         136 :     int *panSums = nullptr;
    2161             : 
    2162         136 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    2163         136 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    2164         272 :     std::vector<int> anVals(256, 0);
    2165             : 
    2166             :     /* ==================================================================== */
    2167             :     /*      Loop over destination scanlines.                                */
    2168             :     /* ==================================================================== */
    2169        7531 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    2170             :     {
    2171        7395 :         double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    2172        7395 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    2173             : #ifdef only_pixels_with_more_than_10_pct_participation
    2174             :         // When oversampling, don't take into account pixels that have a tiny
    2175             :         // participation in the resulting pixel
    2176             :         if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
    2177             :             nSrcYOff < nChunkBottomYOff)
    2178             :             nSrcYOff++;
    2179             : #endif
    2180        7395 :         if (nSrcYOff < nChunkYOff)
    2181           0 :             nSrcYOff = nChunkYOff;
    2182             : 
    2183        7395 :         double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    2184        7395 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    2185             : #ifdef only_pixels_with_more_than_10_pct_participation
    2186             :         // When oversampling, don't take into account pixels that have a tiny
    2187             :         // participation in the resulting pixel
    2188             :         if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
    2189             :             nSrcYOff2 > nChunkYOff)
    2190             :             nSrcYOff2--;
    2191             : #endif
    2192        7395 :         if (nSrcYOff2 == nSrcYOff)
    2193           0 :             ++nSrcYOff2;
    2194        7395 :         if (nSrcYOff2 > nChunkBottomYOff)
    2195           0 :             nSrcYOff2 = nChunkBottomYOff;
    2196             : 
    2197        7395 :         const T *const paSrcScanline =
    2198         149 :             pChunk +
    2199        7395 :             (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
    2200        7395 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    2201        7395 :         if (pabyChunkNodataMask != nullptr)
    2202        1810 :             pabySrcScanlineNodataMask =
    2203             :                 pabyChunkNodataMask +
    2204        1810 :                 static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
    2205             : 
    2206        7395 :         T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
    2207             :         /* --------------------------------------------------------------------
    2208             :          */
    2209             :         /*      Loop over destination pixels */
    2210             :         /* --------------------------------------------------------------------
    2211             :          */
    2212     4259580 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    2213             :         {
    2214     4252187 :             double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    2215             :             // Apply some epsilon to avoid numerical precision issues
    2216     4252187 :             int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    2217             : #ifdef only_pixels_with_more_than_10_pct_participation
    2218             :             // When oversampling, don't take into account pixels that have a
    2219             :             // tiny participation in the resulting pixel
    2220             :             if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
    2221             :                 nSrcXOff < nChunkRightXOff)
    2222             :                 nSrcXOff++;
    2223             : #endif
    2224     4252187 :             if (nSrcXOff < nChunkXOff)
    2225           0 :                 nSrcXOff = nChunkXOff;
    2226             : 
    2227     4252187 :             double dfSrcXOff2 =
    2228     4252187 :                 dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    2229     4252187 :             int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    2230             : #ifdef only_pixels_with_more_than_10_pct_participation
    2231             :             // When oversampling, don't take into account pixels that have a
    2232             :             // tiny participation in the resulting pixel
    2233             :             if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
    2234             :                 nSrcXOff2 > nChunkXOff)
    2235             :                 nSrcXOff2--;
    2236             : #endif
    2237     4252187 :             if (nSrcXOff2 == nSrcXOff)
    2238           0 :                 nSrcXOff2++;
    2239     4252187 :             if (nSrcXOff2 > nChunkRightXOff)
    2240           0 :                 nSrcXOff2 = nChunkRightXOff;
    2241             : 
    2242     4252187 :             bool bRegularProcessing = false;
    2243             :             if constexpr (!std::is_same<T, GByte>::value)
    2244         827 :                 bRegularProcessing = true;
    2245     4251360 :             else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
    2246           0 :                 bRegularProcessing = true;
    2247             : 
    2248     4252187 :             if (bRegularProcessing)
    2249             :             {
    2250             :                 // Not sure how much sense it makes to run a majority
    2251             :                 // filter on floating point data, but here it is for the sake
    2252             :                 // of compatibility. It won't look right on RGB images by the
    2253             :                 // nature of the filter.
    2254             : 
    2255         827 :                 if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
    2256        2481 :                     nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
    2257         827 :                     static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    2258         827 :                             static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
    2259         827 :                         std::numeric_limits<size_t>::max() / sizeof(float))
    2260             :                 {
    2261           0 :                     CPLError(CE_Failure, CPLE_NotSupported,
    2262             :                              "Too big downsampling factor");
    2263           0 :                     CPLFree(paVals);
    2264           0 :                     CPLFree(panSums);
    2265           0 :                     return CE_Failure;
    2266             :                 }
    2267         827 :                 const size_t nNumPx =
    2268         827 :                     static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    2269         827 :                     static_cast<size_t>(nSrcXOff2 - nSrcXOff);
    2270         827 :                 size_t iMaxInd = 0;
    2271         827 :                 size_t iMaxVal = 0;
    2272         827 :                 bool biMaxValdValid = false;
    2273             : 
    2274         827 :                 if (paVals == nullptr || nNumPx > nMaxNumPx)
    2275             :                 {
    2276             :                     T *paValsNew = static_cast<T *>(
    2277          71 :                         VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
    2278             :                     int *panSumsNew = static_cast<int *>(
    2279          71 :                         VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
    2280          71 :                     if (paValsNew != nullptr)
    2281          71 :                         paVals = paValsNew;
    2282          71 :                     if (panSumsNew != nullptr)
    2283          71 :                         panSums = panSumsNew;
    2284          71 :                     if (paValsNew == nullptr || panSumsNew == nullptr)
    2285             :                     {
    2286           0 :                         CPLFree(paVals);
    2287           0 :                         CPLFree(panSums);
    2288           0 :                         return CE_Failure;
    2289             :                     }
    2290          71 :                     nMaxNumPx = nNumPx;
    2291             :                 }
    2292             : 
    2293        2585 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2294             :                 {
    2295        1758 :                     const GPtrDiff_t iTotYOff =
    2296        1758 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2297        1758 :                         nChunkXOff;
    2298        5690 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2299             :                     {
    2300        3932 :                         if (pabySrcScanlineNodataMask == nullptr ||
    2301          16 :                             pabySrcScanlineNodataMask[iX + iTotYOff])
    2302             :                         {
    2303        3917 :                             const T val = paSrcScanline[iX + iTotYOff];
    2304        3917 :                             size_t i = 0;  // Used after for.
    2305             : 
    2306             :                             // Check array for existing entry.
    2307       14387 :                             for (; i < iMaxInd; ++i)
    2308       17626 :                                 if (IsSame(paVals[i], val) &&
    2309        6910 :                                     ++panSums[i] > panSums[iMaxVal])
    2310             :                                 {
    2311         246 :                                     iMaxVal = i;
    2312         246 :                                     biMaxValdValid = true;
    2313         246 :                                     break;
    2314             :                                 }
    2315             : 
    2316             :                             // Add to arr if entry not already there.
    2317        3917 :                             if (i == iMaxInd)
    2318             :                             {
    2319        3671 :                                 paVals[iMaxInd] = val;
    2320        3671 :                                 panSums[iMaxInd] = 1;
    2321             : 
    2322        3671 :                                 if (!biMaxValdValid)
    2323             :                                 {
    2324         824 :                                     iMaxVal = iMaxInd;
    2325         824 :                                     biMaxValdValid = true;
    2326             :                                 }
    2327             : 
    2328        3671 :                                 ++iMaxInd;
    2329             :                             }
    2330             :                         }
    2331             :                     }
    2332             :                 }
    2333             : 
    2334         827 :                 if (!biMaxValdValid)
    2335           3 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2336             :                 else
    2337         824 :                     paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
    2338             :             }
    2339             :             else if constexpr (std::is_same<T, GByte>::value)
    2340             :             // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
    2341             :             {
    2342             :                 // So we go here for a paletted or non-paletted byte band.
    2343             :                 // The input values are then between 0 and 255.
    2344     4251360 :                 int nMaxVal = 0;
    2345     4251360 :                 int iMaxInd = -1;
    2346             : 
    2347             :                 // The cost of this zeroing might be high. Perhaps we should
    2348             :                 // just use the above generic case, and go to this one if the
    2349             :                 // number of source pixels is large enough
    2350     4251360 :                 std::fill(anVals.begin(), anVals.end(), 0);
    2351             : 
    2352    12777700 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2353             :                 {
    2354     8526370 :                     const GPtrDiff_t iTotYOff =
    2355     8526370 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2356     8526370 :                         nChunkXOff;
    2357    25649400 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2358             :                     {
    2359    17123000 :                         const T val = paSrcScanline[iX + iTotYOff];
    2360    17123000 :                         if (!bHasNoData || val != tNoDataValue)
    2361             :                         {
    2362    17123000 :                             int nVal = static_cast<int>(val);
    2363    17123000 :                             if (++anVals[nVal] > nMaxVal)
    2364             :                             {
    2365             :                                 // Sum the density.
    2366             :                                 // Is it the most common value so far?
    2367    17006300 :                                 iMaxInd = nVal;
    2368    17006300 :                                 nMaxVal = anVals[nVal];
    2369             :                             }
    2370             :                         }
    2371             :                     }
    2372             :                 }
    2373             : 
    2374     4251360 :                 if (iMaxInd == -1)
    2375           0 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2376             :                 else
    2377     4251360 :                     paDstScanline[iDstPixel - nDstXOff] =
    2378             :                         static_cast<T>(iMaxInd);
    2379             :             }
    2380             :         }
    2381             :     }
    2382             : 
    2383         136 :     CPLFree(paVals);
    2384         136 :     CPLFree(panSums);
    2385             : 
    2386         136 :     return CE_None;
    2387             : }
    2388             : 
    2389         136 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
    2390             :                                      const void *pChunk, void **ppDstBuffer,
    2391             :                                      GDALDataType *peDstBufferDataType)
    2392             : {
    2393         136 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(
    2394             :         args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
    2395             :         GDALGetDataTypeSizeBytes(args.eWrkDataType));
    2396         136 :     if (*ppDstBuffer == nullptr)
    2397             :     {
    2398           0 :         return CE_Failure;
    2399             :     }
    2400             : 
    2401         136 :     CPLAssert(args.eSrcDataType == args.eWrkDataType);
    2402             : 
    2403         136 :     *peDstBufferDataType = args.eWrkDataType;
    2404         136 :     switch (args.eWrkDataType)
    2405             :     {
    2406             :         // For mode resampling, as no computation is done, only the
    2407             :         // size of the data type matters... except for Byte where we have
    2408             :         // special processing. And for floating point values
    2409          65 :         case GDT_Byte:
    2410             :         {
    2411          65 :             return GDALResampleChunk_ModeT(args,
    2412             :                                            static_cast<const GByte *>(pChunk),
    2413          65 :                                            static_cast<GByte *>(*ppDstBuffer));
    2414             :         }
    2415             : 
    2416           4 :         case GDT_Int8:
    2417             :         {
    2418           4 :             return GDALResampleChunk_ModeT(args,
    2419             :                                            static_cast<const int8_t *>(pChunk),
    2420           4 :                                            static_cast<int8_t *>(*ppDstBuffer));
    2421             :         }
    2422             : 
    2423           9 :         case GDT_Int16:
    2424             :         case GDT_UInt16:
    2425             :         {
    2426           9 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
    2427           9 :             return GDALResampleChunk_ModeT(
    2428             :                 args, static_cast<const uint16_t *>(pChunk),
    2429           9 :                 static_cast<uint16_t *>(*ppDstBuffer));
    2430             :         }
    2431             : 
    2432          15 :         case GDT_CInt16:
    2433             :         case GDT_Int32:
    2434             :         case GDT_UInt32:
    2435             :         {
    2436          15 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
    2437          15 :             return GDALResampleChunk_ModeT(
    2438             :                 args, static_cast<const uint32_t *>(pChunk),
    2439          15 :                 static_cast<uint32_t *>(*ppDstBuffer));
    2440             :         }
    2441             : 
    2442          17 :         case GDT_Float32:
    2443             :         {
    2444          17 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
    2445          17 :             return GDALResampleChunk_ModeT(args,
    2446             :                                            static_cast<const float *>(pChunk),
    2447          17 :                                            static_cast<float *>(*ppDstBuffer));
    2448             :         }
    2449             : 
    2450          12 :         case GDT_CInt32:
    2451             :         case GDT_Int64:
    2452             :         case GDT_UInt64:
    2453             :         {
    2454          12 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
    2455          12 :             return GDALResampleChunk_ModeT(
    2456             :                 args, static_cast<const uint64_t *>(pChunk),
    2457          12 :                 static_cast<uint64_t *>(*ppDstBuffer));
    2458             :         }
    2459             : 
    2460           6 :         case GDT_Float64:
    2461             :         {
    2462           6 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
    2463           6 :             return GDALResampleChunk_ModeT(args,
    2464             :                                            static_cast<const double *>(pChunk),
    2465           6 :                                            static_cast<double *>(*ppDstBuffer));
    2466             :         }
    2467             : 
    2468           4 :         case GDT_CFloat32:
    2469             :         {
    2470           4 :             return GDALResampleChunk_ModeT(
    2471             :                 args, static_cast<const std::complex<float> *>(pChunk),
    2472           4 :                 static_cast<std::complex<float> *>(*ppDstBuffer));
    2473             :         }
    2474             : 
    2475           4 :         case GDT_CFloat64:
    2476             :         {
    2477           4 :             return GDALResampleChunk_ModeT(
    2478             :                 args, static_cast<const std::complex<double> *>(pChunk),
    2479           4 :                 static_cast<std::complex<double> *>(*ppDstBuffer));
    2480             :         }
    2481             : 
    2482           0 :         case GDT_Unknown:
    2483             :         case GDT_TypeCount:
    2484           0 :             break;
    2485             :     }
    2486             : 
    2487           0 :     CPLAssert(false);
    2488             :     return CE_Failure;
    2489             : }
    2490             : 
    2491             : /************************************************************************/
    2492             : /*                  GDALResampleConvolutionHorizontal()                 */
    2493             : /************************************************************************/
    2494             : 
    2495             : template <class T>
    2496             : static inline double
    2497       44642 : GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
    2498             :                                   int nSrcPixelCount)
    2499             : {
    2500       44642 :     double dfVal1 = 0.0;
    2501       44642 :     double dfVal2 = 0.0;
    2502       44642 :     int i = 0;  // Used after for.
    2503             :     // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    2504             :     // manually (untypical) unrolled loop in -O2 and -O3:
    2505             :     // https://github.com/OSGeo/gdal/issues/9508
    2506             : #if !defined(__INTEL_CLANG_COMPILER)
    2507       89044 :     for (; i + 3 < nSrcPixelCount; i += 4)
    2508             :     {
    2509       44402 :         dfVal1 += pChunk[i] * padfWeights[i];
    2510       44402 :         dfVal1 += pChunk[i + 1] * padfWeights[i + 1];
    2511       44402 :         dfVal2 += pChunk[i + 2] * padfWeights[i + 2];
    2512       44402 :         dfVal2 += pChunk[i + 3] * padfWeights[i + 3];
    2513             :     }
    2514             : #endif
    2515       46066 :     for (; i < nSrcPixelCount; ++i)
    2516             :     {
    2517        1424 :         dfVal1 += pChunk[i] * padfWeights[i];
    2518             :     }
    2519       44642 :     return dfVal1 + dfVal2;
    2520             : }
    2521             : 
    2522             : template <class T>
    2523          48 : static inline void GDALResampleConvolutionHorizontalWithMask(
    2524             :     const T *pChunk, const GByte *pabyMask, const double *padfWeights,
    2525             :     int nSrcPixelCount, double &dfVal, double &dfWeightSum)
    2526             : {
    2527          48 :     dfVal = 0;
    2528          48 :     dfWeightSum = 0;
    2529          48 :     int i = 0;
    2530          48 :     for (; i + 3 < nSrcPixelCount; i += 4)
    2531             :     {
    2532           0 :         const double dfWeight0 = padfWeights[i] * pabyMask[i];
    2533           0 :         const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
    2534           0 :         const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
    2535           0 :         const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
    2536           0 :         dfVal += pChunk[i] * dfWeight0;
    2537           0 :         dfVal += pChunk[i + 1] * dfWeight1;
    2538           0 :         dfVal += pChunk[i + 2] * dfWeight2;
    2539           0 :         dfVal += pChunk[i + 3] * dfWeight3;
    2540           0 :         dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
    2541             :     }
    2542         178 :     for (; i < nSrcPixelCount; ++i)
    2543             :     {
    2544         130 :         const double dfWeight = padfWeights[i] * pabyMask[i];
    2545         130 :         dfVal += pChunk[i] * dfWeight;
    2546         130 :         dfWeightSum += dfWeight;
    2547             :     }
    2548          48 : }
    2549             : 
    2550             : template <class T>
    2551     1330334 : static inline void GDALResampleConvolutionHorizontal_3rows(
    2552             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2553             :     const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    2554             :     double &dfRes2, double &dfRes3)
    2555             : {
    2556     1330334 :     double dfVal1 = 0.0;
    2557     1330334 :     double dfVal2 = 0.0;
    2558     1330334 :     double dfVal3 = 0.0;
    2559     1330334 :     double dfVal4 = 0.0;
    2560     1330334 :     double dfVal5 = 0.0;
    2561     1330334 :     double dfVal6 = 0.0;
    2562     1330334 :     int i = 0;  // Used after for.
    2563     2715057 :     for (; i + 3 < nSrcPixelCount; i += 4)
    2564             :     {
    2565     1384722 :         dfVal1 += pChunkRow1[i] * padfWeights[i];
    2566     1384722 :         dfVal1 += pChunkRow1[i + 1] * padfWeights[i + 1];
    2567     1384722 :         dfVal2 += pChunkRow1[i + 2] * padfWeights[i + 2];
    2568     1384722 :         dfVal2 += pChunkRow1[i + 3] * padfWeights[i + 3];
    2569     1384722 :         dfVal3 += pChunkRow2[i] * padfWeights[i];
    2570     1384722 :         dfVal3 += pChunkRow2[i + 1] * padfWeights[i + 1];
    2571     1384722 :         dfVal4 += pChunkRow2[i + 2] * padfWeights[i + 2];
    2572     1384722 :         dfVal4 += pChunkRow2[i + 3] * padfWeights[i + 3];
    2573     1384722 :         dfVal5 += pChunkRow3[i] * padfWeights[i];
    2574     1384722 :         dfVal5 += pChunkRow3[i + 1] * padfWeights[i + 1];
    2575     1384722 :         dfVal6 += pChunkRow3[i + 2] * padfWeights[i + 2];
    2576     1384722 :         dfVal6 += pChunkRow3[i + 3] * padfWeights[i + 3];
    2577             :     }
    2578     1366941 :     for (; i < nSrcPixelCount; ++i)
    2579             :     {
    2580       36607 :         dfVal1 += pChunkRow1[i] * padfWeights[i];
    2581       36607 :         dfVal3 += pChunkRow2[i] * padfWeights[i];
    2582       36607 :         dfVal5 += pChunkRow3[i] * padfWeights[i];
    2583             :     }
    2584     1330334 :     dfRes1 = dfVal1 + dfVal2;
    2585     1330334 :     dfRes2 = dfVal3 + dfVal4;
    2586     1330334 :     dfRes3 = dfVal5 + dfVal6;
    2587     1330334 : }
    2588             : 
    2589             : template <class T>
    2590       18188 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    2591             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2592             :     const double *padfWeights, int nSrcPixelCount, double &dfRes1,
    2593             :     double &dfRes2, double &dfRes3)
    2594             : {
    2595       18188 :     GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
    2596             :                                             padfWeights, nSrcPixelCount, dfRes1,
    2597             :                                             dfRes2, dfRes3);
    2598       18188 : }
    2599             : 
    2600             : template <class T>
    2601     1247346 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
    2602             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2603             :     const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
    2604             : {
    2605     1247346 :     GDALResampleConvolutionHorizontal_3rows(pChunkRow1, pChunkRow2, pChunkRow3,
    2606             :                                             padfWeights, 4, dfRes1, dfRes2,
    2607             :                                             dfRes3);
    2608     1247346 : }
    2609             : 
    2610             : /************************************************************************/
    2611             : /*                  GDALResampleConvolutionVertical()                   */
    2612             : /************************************************************************/
    2613             : 
    2614             : template <class T>
    2615             : static inline double
    2616      463157 : GDALResampleConvolutionVertical(const T *pChunk, int nStride,
    2617             :                                 const double *padfWeights, int nSrcLineCount)
    2618             : {
    2619      463157 :     double dfVal1 = 0.0;
    2620      463157 :     double dfVal2 = 0.0;
    2621      463157 :     int i = 0;
    2622      463157 :     int j = 0;
    2623      912074 :     for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
    2624             :     {
    2625      448917 :         dfVal1 += pChunk[j] * padfWeights[i];
    2626      448917 :         dfVal1 += pChunk[j + nStride] * padfWeights[i + 1];
    2627      448917 :         dfVal2 += pChunk[j + 2 * nStride] * padfWeights[i + 2];
    2628      448917 :         dfVal2 += pChunk[j + 3 * nStride] * padfWeights[i + 3];
    2629             :     }
    2630      515480 :     for (; i < nSrcLineCount; ++i, j += nStride)
    2631             :     {
    2632       52323 :         dfVal1 += pChunk[j] * padfWeights[i];
    2633             :     }
    2634      463157 :     return dfVal1 + dfVal2;
    2635             : }
    2636             : 
    2637             : template <class T>
    2638     2880000 : static inline void GDALResampleConvolutionVertical_2cols(
    2639             :     const T *pChunk, int nStride, const double *padfWeights, int nSrcLineCount,
    2640             :     double &dfRes1, double &dfRes2)
    2641             : {
    2642     2880000 :     double dfVal1 = 0.0;
    2643     2880000 :     double dfVal2 = 0.0;
    2644     2880000 :     double dfVal3 = 0.0;
    2645     2880000 :     double dfVal4 = 0.0;
    2646     2880000 :     int i = 0;
    2647     2880000 :     int j = 0;
    2648     5716800 :     for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
    2649             :     {
    2650     2836800 :         dfVal1 += pChunk[j] * padfWeights[i];
    2651     2836800 :         dfVal3 += pChunk[j + 1] * padfWeights[i];
    2652     2836800 :         dfVal1 += pChunk[j + nStride] * padfWeights[i + 1];
    2653     2836800 :         dfVal3 += pChunk[j + 1 + nStride] * padfWeights[i + 1];
    2654     2836800 :         dfVal2 += pChunk[j + 2 * nStride] * padfWeights[i + 2];
    2655     2836800 :         dfVal4 += pChunk[j + 1 + 2 * nStride] * padfWeights[i + 2];
    2656     2836800 :         dfVal2 += pChunk[j + 3 * nStride] * padfWeights[i + 3];
    2657     2836800 :         dfVal4 += pChunk[j + 1 + 3 * nStride] * padfWeights[i + 3];
    2658             :     }
    2659     2995210 :     for (; i < nSrcLineCount; ++i, j += nStride)
    2660             :     {
    2661      115210 :         dfVal1 += pChunk[j] * padfWeights[i];
    2662      115210 :         dfVal3 += pChunk[j + 1] * padfWeights[i];
    2663             :     }
    2664     2880000 :     dfRes1 = dfVal1 + dfVal2;
    2665     2880000 :     dfRes2 = dfVal3 + dfVal4;
    2666     2880000 : }
    2667             : 
    2668             : #ifdef USE_SSE2
    2669             : 
    2670             : #ifdef __AVX__
    2671             : /************************************************************************/
    2672             : /*             GDALResampleConvolutionVertical_16cols<T>                */
    2673             : /************************************************************************/
    2674             : 
    2675             : template <class T>
    2676             : static inline void
    2677             : GDALResampleConvolutionVertical_16cols(const T *pChunk, int nStride,
    2678             :                                        const double *padfWeights,
    2679             :                                        int nSrcLineCount, float *afDest)
    2680             : {
    2681             :     int i = 0;
    2682             :     int j = 0;
    2683             :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
    2684             :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2685             :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    2686             :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
    2687             :     for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
    2688             :     {
    2689             :         XMMReg4Double w0 =
    2690             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
    2691             :         XMMReg4Double w1 =
    2692             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    2693             :         XMMReg4Double w2 =
    2694             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    2695             :         XMMReg4Double w3 =
    2696             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    2697             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
    2698             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    2699             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
    2700             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
    2701             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
    2702             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    2703             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
    2704             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
    2705             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
    2706             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    2707             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
    2708             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
    2709             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
    2710             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    2711             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
    2712             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
    2713             :     }
    2714             :     for (; i < nSrcLineCount; ++i, j += nStride)
    2715             :     {
    2716             :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    2717             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    2718             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    2719             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
    2720             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
    2721             :     }
    2722             :     v_acc0.Store4Val(afDest);
    2723             :     v_acc1.Store4Val(afDest + 4);
    2724             :     v_acc2.Store4Val(afDest + 8);
    2725             :     v_acc3.Store4Val(afDest + 12);
    2726             : }
    2727             : 
    2728             : template <class T>
    2729             : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
    2730             :                                                           const double *, int,
    2731             :                                                           double *)
    2732             : {
    2733             :     // Cannot be reached
    2734             :     CPLAssert(false);
    2735             : }
    2736             : 
    2737             : #else
    2738             : 
    2739             : /************************************************************************/
    2740             : /*              GDALResampleConvolutionVertical_8cols<T>                */
    2741             : /************************************************************************/
    2742             : 
    2743             : template <class T>
    2744             : static inline void
    2745    18613000 : GDALResampleConvolutionVertical_8cols(const T *pChunk, int nStride,
    2746             :                                       const double *padfWeights,
    2747             :                                       int nSrcLineCount, float *afDest)
    2748             : {
    2749    18613000 :     int i = 0;
    2750    18613000 :     int j = 0;
    2751    18613000 :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
    2752    18428000 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2753    33729700 :     for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
    2754             :     {
    2755    15155500 :         XMMReg4Double w0 =
    2756    15155500 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
    2757    15090400 :         XMMReg4Double w1 =
    2758    15090400 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    2759    15045700 :         XMMReg4Double w2 =
    2760    15045700 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    2761    15073900 :         XMMReg4Double w3 =
    2762    15073900 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    2763    15084900 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
    2764    15065600 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    2765    15116300 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
    2766    15137700 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    2767    15137000 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
    2768    15110400 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    2769    15126700 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
    2770    15132700 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    2771             :     }
    2772    29981300 :     for (; i < nSrcLineCount; ++i, j += nStride)
    2773             :     {
    2774    11407100 :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    2775    11407100 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    2776    11407100 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    2777             :     }
    2778    18574200 :     v_acc0.Store4Val(afDest);
    2779    18545900 :     v_acc1.Store4Val(afDest + 4);
    2780    18572300 : }
    2781             : 
    2782             : template <class T>
    2783             : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
    2784             :                                                          const double *, int,
    2785             :                                                          double *)
    2786             : {
    2787             :     // Cannot be reached
    2788             :     CPLAssert(false);
    2789             : }
    2790             : 
    2791             : #endif  // __AVX__
    2792             : 
    2793             : /************************************************************************/
    2794             : /*              GDALResampleConvolutionHorizontalSSE2<T>                */
    2795             : /************************************************************************/
    2796             : 
    2797             : template <class T>
    2798     2737535 : static inline double GDALResampleConvolutionHorizontalSSE2(
    2799             :     const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    2800             : {
    2801     2737535 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2802     2737003 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    2803     2737344 :     int i = 0;  // Used after for.
    2804     2813017 :     for (; i + 7 < nSrcPixelCount; i += 8)
    2805             :     {
    2806             :         // Retrieve the pixel & accumulate
    2807       75571 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
    2808       75571 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
    2809       75571 :         const XMMReg4Double v_weight1 =
    2810       75571 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    2811       75571 :         const XMMReg4Double v_weight2 =
    2812       75571 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    2813             : 
    2814       75571 :         v_acc1 += v_pixels1 * v_weight1;
    2815       75571 :         v_acc2 += v_pixels2 * v_weight2;
    2816             :     }
    2817             : 
    2818     2737445 :     v_acc1 += v_acc2;
    2819             : 
    2820     2737358 :     double dfVal = v_acc1.GetHorizSum();
    2821     9501560 :     for (; i < nSrcPixelCount; ++i)
    2822             :     {
    2823     6764520 :         dfVal += pChunk[i] * padfWeightsAligned[i];
    2824             :     }
    2825     2737029 :     return dfVal;
    2826             : }
    2827             : 
    2828             : /************************************************************************/
    2829             : /*              GDALResampleConvolutionHorizontal<GByte>                */
    2830             : /************************************************************************/
    2831             : 
    2832             : template <>
    2833     2189530 : inline double GDALResampleConvolutionHorizontal<GByte>(
    2834             :     const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    2835             : {
    2836     2189530 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
    2837     2189530 :                                                  nSrcPixelCount);
    2838             : }
    2839             : 
    2840             : template <>
    2841      548160 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
    2842             :     const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    2843             : {
    2844      548160 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
    2845      548433 :                                                  nSrcPixelCount);
    2846             : }
    2847             : 
    2848             : /************************************************************************/
    2849             : /*              GDALResampleConvolutionHorizontalWithMaskSSE2<T>        */
    2850             : /************************************************************************/
    2851             : 
    2852             : template <class T>
    2853     5736213 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
    2854             :     const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
    2855             :     int nSrcPixelCount, double &dfVal, double &dfWeightSum)
    2856             : {
    2857     5736213 :     int i = 0;  // Used after for.
    2858     5736213 :     XMMReg4Double v_acc = XMMReg4Double::Zero();
    2859     5736213 :     XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
    2860    16247021 :     for (; i + 3 < nSrcPixelCount; i += 4)
    2861             :     {
    2862    10510858 :         const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
    2863    10510858 :         const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
    2864    10510858 :         XMMReg4Double v_weight =
    2865    10510858 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    2866    10510858 :         v_weight *= v_mask;
    2867    10510858 :         v_acc += v_pixels * v_weight;
    2868    10510858 :         v_acc_weight += v_weight;
    2869             :     }
    2870             : 
    2871     5736213 :     dfVal = v_acc.GetHorizSum();
    2872     5736213 :     dfWeightSum = v_acc_weight.GetHorizSum();
    2873     5927983 :     for (; i < nSrcPixelCount; ++i)
    2874             :     {
    2875      191772 :         const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
    2876      191772 :         dfVal += pChunk[i] * dfWeight;
    2877      191772 :         dfWeightSum += dfWeight;
    2878             :     }
    2879     5736213 : }
    2880             : 
    2881             : /************************************************************************/
    2882             : /*              GDALResampleConvolutionHorizontalWithMask<GByte>        */
    2883             : /************************************************************************/
    2884             : 
    2885             : template <>
    2886     5736150 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
    2887             :     const GByte *pChunk, const GByte *pabyMask,
    2888             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
    2889             :     double &dfWeightSum)
    2890             : {
    2891     5736150 :     GDALResampleConvolutionHorizontalWithMaskSSE2(
    2892             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
    2893             :         dfWeightSum);
    2894     5736150 : }
    2895             : 
    2896             : template <>
    2897          63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
    2898             :     const GUInt16 *pChunk, const GByte *pabyMask,
    2899             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
    2900             :     double &dfWeightSum)
    2901             : {
    2902          63 :     GDALResampleConvolutionHorizontalWithMaskSSE2(
    2903             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
    2904             :         dfWeightSum);
    2905          63 : }
    2906             : 
    2907             : /************************************************************************/
    2908             : /*              GDALResampleConvolutionHorizontal_3rows_SSE2<T>         */
    2909             : /************************************************************************/
    2910             : 
    2911             : template <class T>
    2912    10023630 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
    2913             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2914             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    2915             :     double &dfRes2, double &dfRes3)
    2916             : {
    2917    10023630 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
    2918    10023630 :                   v_acc2 = XMMReg4Double::Zero(),
    2919    10023630 :                   v_acc3 = XMMReg4Double::Zero();
    2920    10023630 :     int i = 0;
    2921    19989466 :     for (; i + 7 < nSrcPixelCount; i += 8)
    2922             :     {
    2923             :         // Retrieve the pixel & accumulate.
    2924     9965826 :         XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    2925     9965826 :         XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
    2926     9965826 :         const XMMReg4Double v_weight1 =
    2927     9965826 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    2928     9965826 :         const XMMReg4Double v_weight2 =
    2929     9965826 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    2930             : 
    2931     9965826 :         v_acc1 += v_pixels1 * v_weight1;
    2932     9965826 :         v_acc1 += v_pixels2 * v_weight2;
    2933             : 
    2934     9965826 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
    2935     9965826 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
    2936     9965826 :         v_acc2 += v_pixels1 * v_weight1;
    2937     9965826 :         v_acc2 += v_pixels2 * v_weight2;
    2938             : 
    2939     9965826 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
    2940     9965826 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
    2941     9965826 :         v_acc3 += v_pixels1 * v_weight1;
    2942     9965826 :         v_acc3 += v_pixels2 * v_weight2;
    2943             :     }
    2944             : 
    2945    10023630 :     dfRes1 = v_acc1.GetHorizSum();
    2946    10023630 :     dfRes2 = v_acc2.GetHorizSum();
    2947    10023630 :     dfRes3 = v_acc3.GetHorizSum();
    2948    21487226 :     for (; i < nSrcPixelCount; ++i)
    2949             :     {
    2950    11463596 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    2951    11463596 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    2952    11463596 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    2953             :     }
    2954    10023630 : }
    2955             : 
    2956             : /************************************************************************/
    2957             : /*              GDALResampleConvolutionHorizontal_3rows<GByte>          */
    2958             : /************************************************************************/
    2959             : 
    2960             : template <>
    2961    10023600 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
    2962             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    2963             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    2964             :     double &dfRes2, double &dfRes3)
    2965             : {
    2966    10023600 :     GDALResampleConvolutionHorizontal_3rows_SSE2(
    2967             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    2968             :         dfRes1, dfRes2, dfRes3);
    2969    10023600 : }
    2970             : 
    2971             : template <>
    2972          30 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
    2973             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    2974             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    2975             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
    2976             : {
    2977          30 :     GDALResampleConvolutionHorizontal_3rows_SSE2(
    2978             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    2979             :         dfRes1, dfRes2, dfRes3);
    2980          30 : }
    2981             : 
    2982             : /************************************************************************/
    2983             : /*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>   */
    2984             : /************************************************************************/
    2985             : 
    2986             : template <class T>
    2987     2173256 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    2988             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2989             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    2990             :     double &dfRes2, double &dfRes3)
    2991             : {
    2992     2173256 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2993     2173019 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    2994     2173119 :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
    2995     2173145 :     int i = 0;  // Use after for.
    2996     2176400 :     for (; i + 3 < nSrcPixelCount; i += 4)
    2997             :     {
    2998             :         // Retrieve the pixel & accumulate.
    2999        3236 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    3000        3236 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
    3001        3236 :         const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
    3002        3236 :         const XMMReg4Double v_weight =
    3003        3236 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3004             : 
    3005        3236 :         v_acc1 += v_pixels1 * v_weight;
    3006        3236 :         v_acc2 += v_pixels2 * v_weight;
    3007        3236 :         v_acc3 += v_pixels3 * v_weight;
    3008             :     }
    3009             : 
    3010     2173170 :     dfRes1 = v_acc1.GetHorizSum();
    3011     2173005 :     dfRes2 = v_acc2.GetHorizSum();
    3012     2173052 :     dfRes3 = v_acc3.GetHorizSum();
    3013             : 
    3014     6494420 :     for (; i < nSrcPixelCount; ++i)
    3015             :     {
    3016     4321375 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    3017     4321375 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    3018     4321375 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    3019             :     }
    3020     2173045 : }
    3021             : 
    3022             : /************************************************************************/
    3023             : /*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>    */
    3024             : /************************************************************************/
    3025             : 
    3026             : template <>
    3027     2106350 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
    3028             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3029             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3030             :     double &dfRes2, double &dfRes3)
    3031             : {
    3032     2106350 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3033             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3034             :         dfRes1, dfRes2, dfRes3);
    3035     2106360 : }
    3036             : 
    3037             : template <>
    3038       66764 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
    3039             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3040             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    3041             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
    3042             : {
    3043       66764 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3044             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3045             :         dfRes1, dfRes2, dfRes3);
    3046       66958 : }
    3047             : 
    3048             : /************************************************************************/
    3049             : /*     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>       */
    3050             : /************************************************************************/
    3051             : 
    3052             : template <class T>  // Weighted sum of a 4-pixel window, computed for three rows with one shared weight vector.
    3053    12204790 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3054             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,  // start of the window in each of the three rows
    3055             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,  // weights are read through Load4ValAligned; dfRes1/2 are outputs
    3056             :     double &dfRes3)  // output for row 3
    3057             : {
    3058    12204790 :     const XMMReg4Double v_weight =  // loaded once and reused for all three rows
    3059             :         XMMReg4Double::Load4ValAligned(padfWeightsAligned);  // NOTE(review): presumably requires an aligned pointer - confirm in XMMReg4Double
    3060             : 
    3061             :     // Retrieve the pixels of each row, weight them, then reduce to one value per row.
    3062    12149390 :     const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);  // unaligned-capable load, going by the name - confirm
    3063    12232740 :     const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
    3064    12238650 :     const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
    3065             : 
    3066    12242310 :     XMMReg4Double v_acc1 = v_pixels1 * v_weight;  // per-lane pixel * weight
    3067    12150770 :     XMMReg4Double v_acc2 = v_pixels2 * v_weight;
    3068    12173520 :     XMMReg4Double v_acc3 = v_pixels3 * v_weight;
    3069             : 
    3070    12143270 :     dfRes1 = v_acc1.GetHorizSum();  // GetHorizSum presumably adds the lanes of the register - confirm
    3071    12160310 :     dfRes2 = v_acc2.GetHorizSum();
    3072    12177470 :     dfRes3 = v_acc3.GetHorizSum();
    3073    12182550 : }
    3074             : 
    3075             : /************************************************************************/
    3076             : /*       GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>      */
    3077             : /************************************************************************/
    3078             : 
    3079             : template <>  // GByte specialization: delegates to the 4-pixel, 3-row _SSE2 helper.
    3080     6635020 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
    3081             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,  // start of the window in each row
    3082             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,  // weights are read with an aligned load by the helper
    3083             :     double &dfRes3)  // dfRes1..3 are assigned by the helper, one per row
    3084             : {
    3085     6635020 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(  // all arguments are forwarded unchanged
    3086             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    3087             :         dfRes3);
    3088     6632800 : }
    3089             : 
    3090             : template <>  // GUInt16 specialization: delegates to the 4-pixel, 3-row _SSE2 helper.
    3091     5572300 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
    3092             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,  // start of the window in rows 1 and 2
    3093             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,  // row 3; weights are read with an aligned load by the helper
    3094             :     double &dfRes2, double &dfRes3)  // dfRes1..3 are assigned by the helper, one per row
    3095             : {
    3096     5572300 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(  // all arguments are forwarded unchanged
    3097             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    3098             :         dfRes3);
    3099     5531470 : }
    3100             : 
    3101             : #endif  // USE_SSE2
    3102             : 
    3103             : /************************************************************************/
    3104             : /*                    GDALResampleChunk_Convolution()                   */
    3105             : /************************************************************************/
    3106             : 
    3107             : template <class T, class Twork, GDALDataType eWrkDataType>
    3108        3650 : static CPLErr GDALResampleChunk_ConvolutionT(
    3109             :     const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
    3110             :     FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
    3111             :     int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
    3112             : 
    3113             : {
    3114        3650 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    3115        3650 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    3116        3650 :     const double dfSrcXDelta = args.dfSrcXDelta;
    3117        3650 :     const double dfSrcYDelta = args.dfSrcYDelta;
    3118        3650 :     constexpr int nBands = 1;
    3119        3650 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    3120        3650 :     const int nChunkXOff = args.nChunkXOff;
    3121        3650 :     const int nChunkXSize = args.nChunkXSize;
    3122        3650 :     const int nChunkYOff = args.nChunkYOff;
    3123        3650 :     const int nChunkYSize = args.nChunkYSize;
    3124        3650 :     const int nDstXOff = args.nDstXOff;
    3125        3650 :     const int nDstXOff2 = args.nDstXOff2;
    3126        3650 :     const int nDstYOff = args.nDstYOff;
    3127        3650 :     const int nDstYOff2 = args.nDstYOff2;
    3128        3650 :     const bool bHasNoData = args.bHasNoData;
    3129        3650 :     double dfNoDataValue = args.dfNoDataValue;
    3130             : 
    3131        3650 :     if (!bHasNoData)
    3132        3595 :         dfNoDataValue = 0.0;
    3133        3650 :     const auto dstDataType = args.eOvrDataType;
    3134        3650 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
    3135        3641 :     const double dfReplacementVal =
    3136          46 :         bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
    3137             :                    : dfNoDataValue;
    3138             :     // cppcheck-suppress unreadVariable
    3139        3641 :     const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
    3140        3631 :     const auto nNodataValueInt64 = static_cast<GInt64>(dfNoDataValue);
    3141        3631 :     constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
    3142             : 
    3143             :     // TODO: we should have some generic function to do this.
    3144        3631 :     Twork fDstMin = -std::numeric_limits<Twork>::max();
    3145        3631 :     Twork fDstMax = std::numeric_limits<Twork>::max();
    3146        3631 :     if (dstDataType == GDT_Byte)
    3147             :     {
    3148        2924 :         fDstMin = std::numeric_limits<GByte>::min();
    3149        2922 :         fDstMax = std::numeric_limits<GByte>::max();
    3150             :     }
    3151         709 :     else if (dstDataType == GDT_Int8)
    3152             :     {
    3153           1 :         fDstMin = std::numeric_limits<GInt8>::min();
    3154           1 :         fDstMax = std::numeric_limits<GInt8>::max();
    3155             :     }
    3156         708 :     else if (dstDataType == GDT_UInt16)
    3157             :     {
    3158         383 :         fDstMin = std::numeric_limits<GUInt16>::min();
    3159         382 :         fDstMax = std::numeric_limits<GUInt16>::max();
    3160             :     }
    3161         329 :     else if (dstDataType == GDT_Int16)
    3162             :     {
    3163         279 :         fDstMin = std::numeric_limits<GInt16>::min();
    3164         279 :         fDstMax = std::numeric_limits<GInt16>::max();
    3165             :     }
    3166          50 :     else if (dstDataType == GDT_UInt32)
    3167             :     {
    3168           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
    3169           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
    3170             :     }
    3171          49 :     else if (dstDataType == GDT_Int32)
    3172             :     {
    3173             :         // cppcheck-suppress unreadVariable
    3174           2 :         fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
    3175             :         // cppcheck-suppress unreadVariable
    3176           2 :         fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
    3177             :     }
    3178          47 :     else if (dstDataType == GDT_UInt64)
    3179             :     {
    3180             :         // cppcheck-suppress unreadVariable
    3181           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
    3182             :         // cppcheck-suppress unreadVariable
    3183           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<uint64_t>::max());
    3184             :     }
    3185          46 :     else if (dstDataType == GDT_Int64)
    3186             :     {
    3187             :         // cppcheck-suppress unreadVariable
    3188           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
    3189             :         // cppcheck-suppress unreadVariable
    3190           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<int64_t>::max());
    3191             :     }
    3192             : 
    3193    27544448 :     auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
    3194             :                                nNodataValueInt64, dfNoDataValue,
    3195             :                                dfReplacementVal](Twork fVal)
    3196             :     {
    3197    14634400 :         if (!bHasNoData)
    3198    11408000 :             return fVal;
    3199             : 
    3200             :         // Clamp value before comparing to nodata: this is only needed for
    3201             :         // kernels with negative weights (Lanczos)
    3202     3226360 :         Twork fClamped = fVal;
    3203     3226360 :         if (fClamped < fDstMin)
    3204       12874 :             fClamped = fDstMin;
    3205     3213490 :         else if (fClamped > fDstMax)
    3206       12852 :             fClamped = fDstMax;
    3207     3226360 :         if (isIntegerDT)
    3208             :         {
    3209     3226370 :             if (nNodataValueInt64 == static_cast<GInt64>(std::round(fClamped)))
    3210             :             {
    3211             :                 // Do not use the nodata value
    3212       13869 :                 return static_cast<Twork>(dfReplacementVal);
    3213             :             }
    3214             :         }
    3215           0 :         else if (dfNoDataValue == fClamped)
    3216             :         {
    3217             :             // Do not use the nodata value
    3218           1 :             return static_cast<Twork>(dfReplacementVal);
    3219             :         }
    3220     3212490 :         return fClamped;
    3221             :     };
    3222             : 
    3223             :     /* -------------------------------------------------------------------- */
    3224             :     /*      Allocate work buffers.                                          */
    3225             :     /* -------------------------------------------------------------------- */
    3226        3631 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    3227        3631 :     Twork *pafWrkScanline = nullptr;
    3228        3631 :     if (dstDataType != eWrkDataType)
    3229             :     {
    3230             :         pafWrkScanline =
    3231        3592 :             static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
    3232        3601 :         if (pafWrkScanline == nullptr)
    3233           0 :             return CE_Failure;
    3234             :     }
    3235             : 
    3236        3640 :     const double dfXScale = 1.0 / dfXRatioDstToSrc;
    3237        3640 :     const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
    3238        3640 :     const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
    3239        3640 :     const double dfYScale = 1.0 / dfYRatioDstToSrc;
    3240        3640 :     const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
    3241        3640 :     const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
    3242             : 
    3243             :     // Temporary array to store result of horizontal filter.
    3244             :     double *padfHorizontalFiltered = static_cast<double *>(
    3245        3640 :         VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
    3246             : 
    3247             :     // To store convolution coefficients.
    3248        3645 :     double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
    3249             :         static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
    3250             :                          0.5) *
    3251             :         sizeof(double)));
    3252             : 
    3253        3639 :     GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
    3254        3639 :     if (pabyChunkNodataMask)
    3255             :         pabyChunkNodataMaskHorizontalFiltered =
    3256         377 :             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
    3257        3639 :     if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
    3258         377 :         (pabyChunkNodataMask != nullptr &&
    3259             :          pabyChunkNodataMaskHorizontalFiltered == nullptr))
    3260             :     {
    3261           5 :         VSIFree(pafWrkScanline);
    3262           0 :         VSIFree(padfHorizontalFiltered);
    3263           0 :         VSIFreeAligned(padfWeights);
    3264           0 :         VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    3265           0 :         return CE_Failure;
    3266             :     }
    3267             : 
    3268             :     /* ==================================================================== */
    3269             :     /*      First pass: horizontal filter                                   */
    3270             :     /* ==================================================================== */
    3271        3634 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    3272             : #ifdef USE_SSE2
    3273        3634 :     bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
    3274             : #endif
    3275     2723275 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    3276             :     {
    3277     2719619 :         const double dfSrcPixel =
    3278     2719619 :             (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
    3279     2719619 :         int nSrcPixelStart =
    3280     2719619 :             static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
    3281     2719619 :         if (nSrcPixelStart < nChunkXOff)
    3282       55092 :             nSrcPixelStart = nChunkXOff;
    3283     2719619 :         int nSrcPixelStop =
    3284     2719619 :             static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
    3285     2719619 :         if (nSrcPixelStop > nChunkRightXOff)
    3286       55115 :             nSrcPixelStop = nChunkRightXOff;
    3287             : #if 0
    3288             :         if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
    3289             :         {
    3290             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3291             :         }
    3292             :         if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
    3293             :         {
    3294             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3295             :         }
    3296             : #endif
    3297     2719619 :         const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
    3298     2719619 :         double dfWeightSum = 0.0;
    3299             : 
    3300             :         // Compute convolution coefficients.
    3301     2719619 :         int nSrcPixel = nSrcPixelStart;
    3302     2719619 :         double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
    3303     3563017 :         for (; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel += 4)
    3304             :         {
    3305      843637 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
    3306      843637 :             dfX += dfXScaleWeight;
    3307      843637 :             padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
    3308      843637 :             dfX += dfXScaleWeight;
    3309      843637 :             padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
    3310      843637 :             dfX += dfXScaleWeight;
    3311      843637 :             padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
    3312      843637 :             dfX += dfXScaleWeight;
    3313      843390 :             dfWeightSum +=
    3314      843637 :                 pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
    3315             :         }
    3316     6700731 :         for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
    3317             :         {
    3318     3981305 :             const double dfWeight = pfnFilterFunc(dfX);
    3319     3981348 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
    3320     3981348 :             dfWeightSum += dfWeight;
    3321             :         }
    3322             : 
    3323     2719426 :         const int nHeight = nChunkYSize * nBands;
    3324     2719426 :         if (pabyChunkNodataMask == nullptr)
    3325             :         {
    3326     2648612 :             if (dfWeightSum != 0)
    3327             :             {
    3328     2648622 :                 const double dfInvWeightSum = 1.0 / dfWeightSum;
    3329     9452133 :                 for (int i = 0; i < nSrcPixelCount; ++i)
    3330     6803505 :                     padfWeights[i] *= dfInvWeightSum;
    3331             :             }
    3332     2648612 :             int iSrcLineOff = 0;
    3333             : #ifdef USE_SSE2
    3334     2648612 :             if (nSrcPixelCount == 4)
    3335             :             {
    3336    13959446 :                 for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
    3337             :                 {
    3338    13427436 :                     const GPtrDiff_t j =
    3339    13427436 :                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3340    13427436 :                         (nSrcPixelStart - nChunkXOff);
    3341    13427436 :                     double dfVal1 = 0.0;
    3342    13427436 :                     double dfVal2 = 0.0;
    3343    13427436 :                     double dfVal3 = 0.0;
    3344    13427436 :                     GDALResampleConvolutionHorizontalPixelCount4_3rows(
    3345    13427436 :                         pChunk + j, pChunk + j + nChunkXSize,
    3346    13427436 :                         pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
    3347             :                         dfVal2, dfVal3);
    3348    13423926 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3349    13423926 :                                                nDstXSize +
    3350    13423926 :                                            iDstPixel - nDstXOff] = dfVal1;
    3351    13423926 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3352    13423926 :                                             1) *
    3353    13423926 :                                                nDstXSize +
    3354    13423926 :                                            iDstPixel - nDstXOff] = dfVal2;
    3355    13423926 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3356    13423926 :                                             2) *
    3357    13423926 :                                                nDstXSize +
    3358    13423926 :                                            iDstPixel - nDstXOff] = dfVal3;
    3359             :                 }
    3360             :             }
    3361     2113101 :             else if (bSrcPixelCountLess8)
    3362             :             {
    3363     4226204 :                 for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
    3364             :                 {
    3365     2191176 :                     const GPtrDiff_t j =
    3366     2191176 :                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3367     2191176 :                         (nSrcPixelStart - nChunkXOff);
    3368     2191176 :                     double dfVal1 = 0.0;
    3369     2191176 :                     double dfVal2 = 0.0;
    3370     2191176 :                     double dfVal3 = 0.0;
    3371     2191176 :                     GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    3372     2191176 :                         pChunk + j, pChunk + j + nChunkXSize,
    3373     2191176 :                         pChunk + j + 2 * nChunkXSize, padfWeights,
    3374             :                         nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3375     2191495 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3376     2191495 :                                                nDstXSize +
    3377     2191495 :                                            iDstPixel - nDstXOff] = dfVal1;
    3378     2191495 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3379     2191495 :                                             1) *
    3380     2191495 :                                                nDstXSize +
    3381     2191495 :                                            iDstPixel - nDstXOff] = dfVal2;
    3382     2191495 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3383     2191495 :                                             2) *
    3384     2191495 :                                                nDstXSize +
    3385     2191495 :                                            iDstPixel - nDstXOff] = dfVal3;
    3386             :                 }
    3387             :             }
    3388             :             else
    3389             : #endif
    3390             :             {
    3391    10166842 :                 for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
    3392             :                 {
    3393    10088430 :                     const GPtrDiff_t j =
    3394    10088430 :                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3395    10088430 :                         (nSrcPixelStart - nChunkXOff);
    3396    10088430 :                     double dfVal1 = 0.0;
    3397    10088430 :                     double dfVal2 = 0.0;
    3398    10088430 :                     double dfVal3 = 0.0;
    3399    10088430 :                     GDALResampleConvolutionHorizontal_3rows(
    3400    10088430 :                         pChunk + j, pChunk + j + nChunkXSize,
    3401    10088430 :                         pChunk + j + 2 * nChunkXSize, padfWeights,
    3402             :                         nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3403    10088430 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3404    10088430 :                                                nDstXSize +
    3405    10088430 :                                            iDstPixel - nDstXOff] = dfVal1;
    3406    10088430 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3407    10088430 :                                             1) *
    3408    10088430 :                                                nDstXSize +
    3409    10088430 :                                            iDstPixel - nDstXOff] = dfVal2;
    3410    10088430 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3411    10088430 :                                             2) *
    3412    10088430 :                                                nDstXSize +
    3413    10088430 :                                            iDstPixel - nDstXOff] = dfVal3;
    3414             :                 }
    3415             :             }
    3416     5428005 :             for (; iSrcLineOff < nHeight; ++iSrcLineOff)
    3417             :             {
    3418     2782344 :                 const GPtrDiff_t j =
    3419     2782344 :                     static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3420     2782344 :                     (nSrcPixelStart - nChunkXOff);
    3421     5520282 :                 const double dfVal = GDALResampleConvolutionHorizontal(
    3422     2782344 :                     pChunk + j, padfWeights, nSrcPixelCount);
    3423     2782583 :                 padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3424     2782583 :                                            nDstXSize +
    3425     2782583 :                                        iDstPixel - nDstXOff] = dfVal;
    3426             :             }
    3427             :         }
    3428             :         else
    3429             :         {
    3430    18280051 :             for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
    3431             :             {
    3432    18206118 :                 const GPtrDiff_t j =
    3433    18206118 :                     static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3434    18206118 :                     (nSrcPixelStart - nChunkXOff);
    3435             : 
    3436    18206118 :                 if (bKernelWithNegativeWeights)
    3437             :                 {
    3438    17725512 :                     int nConsecutiveValid = 0;
    3439    17725512 :                     int nMaxConsecutiveValid = 0;
    3440   164371458 :                     for (int k = 0; k < nSrcPixelCount; k++)
    3441             :                     {
    3442   146646146 :                         if (pabyChunkNodataMask[j + k])
    3443    40208853 :                             nConsecutiveValid++;
    3444   106436793 :                         else if (nConsecutiveValid)
    3445             :                         {
    3446       96592 :                             nMaxConsecutiveValid = std::max(
    3447       96592 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    3448       96592 :                             nConsecutiveValid = 0;
    3449             :                         }
    3450             :                     }
    3451    17725512 :                     nMaxConsecutiveValid =
    3452    17725512 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    3453    17725512 :                     if (nMaxConsecutiveValid < nSrcPixelCount / 2)
    3454             :                     {
    3455    12469807 :                         const size_t nTempOffset =
    3456    12469807 :                             static_cast<size_t>(iSrcLineOff) * nDstXSize +
    3457    12469807 :                             iDstPixel - nDstXOff;
    3458    12469807 :                         padfHorizontalFiltered[nTempOffset] = 0.0;
    3459    12469807 :                         pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3460    12469807 :                         continue;
    3461             :                     }
    3462             :                 }
    3463             : 
    3464     5736261 :                 double dfVal = 0.0;
    3465     5736261 :                 GDALResampleConvolutionHorizontalWithMask(
    3466     5736261 :                     pChunk + j, pabyChunkNodataMask + j, padfWeights,
    3467             :                     nSrcPixelCount, dfVal, dfWeightSum);
    3468     5739430 :                 const size_t nTempOffset =
    3469     5739430 :                     static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
    3470     5739430 :                     nDstXOff;
    3471     5739430 :                 if (dfWeightSum > 0.0)
    3472             :                 {
    3473     5691828 :                     padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
    3474     5691828 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
    3475             :                 }
    3476             :                 else
    3477             :                 {
    3478       47595 :                     padfHorizontalFiltered[nTempOffset] = 0.0;
    3479       47595 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3480             :                 }
    3481             :             }
    3482             :         }
    3483             :     }
    3484             : 
    3485             :     /* ==================================================================== */
    3486             :     /*      Second pass: vertical filter                                    */
    3487             :     /* ==================================================================== */
    3488        3651 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    3489             : 
    3490      196900 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    3491             :     {
    3492      193249 :         Twork *const pafDstScanline =
    3493      193249 :             pafWrkScanline ? pafWrkScanline
    3494        8421 :                            : static_cast<Twork *>(pDstBuffer) +
    3495        8421 :                                  (iDstLine - nDstYOff) * nDstXSize;
    3496             : 
    3497      193249 :         const double dfSrcLine =
    3498      193249 :             (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
    3499      193249 :         int nSrcLineStart =
    3500      193249 :             static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
    3501      193249 :         int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
    3502      193249 :         if (nSrcLineStart < nChunkYOff)
    3503        2275 :             nSrcLineStart = nChunkYOff;
    3504      193249 :         if (nSrcLineStop > nChunkBottomYOff)
    3505        2311 :             nSrcLineStop = nChunkBottomYOff;
    3506             : #if 0
    3507             :         if( nSrcLineStart < nChunkYOff &&
    3508             :             nChunkYOff > 0 )
    3509             :         {
    3510             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    3511             :         }
    3512             :         if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
    3513             :         {
    3514             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    3515             :         }
    3516             : #endif
    3517      193249 :         const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
    3518      193249 :         double dfWeightSum = 0.0;
    3519             : 
    3520             :         // Compute convolution coefficients.
    3521      193249 :         int nSrcLine = nSrcLineStart;  // Used after for.
    3522      193249 :         double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
    3523      428660 :         for (; nSrcLine + 3 < nSrcLineStop;
    3524      235411 :              nSrcLine += 4, dfY += 4 * dfYScaleWeight)
    3525             :         {
    3526      235402 :             padfWeights[nSrcLine - nSrcLineStart] = dfY;
    3527      235402 :             padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
    3528      235402 :             padfWeights[nSrcLine + 2 - nSrcLineStart] =
    3529      235402 :                 dfY + 2 * dfYScaleWeight;
    3530      235402 :             padfWeights[nSrcLine + 3 - nSrcLineStart] =
    3531      235402 :                 dfY + 3 * dfYScaleWeight;
    3532      235411 :             dfWeightSum +=
    3533      235402 :                 pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
    3534             :         }
    3535      226006 :         for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
    3536             :         {
    3537       32772 :             const double dfWeight = pfnFilterFunc(dfY);
    3538       32748 :             padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
    3539       32748 :             dfWeightSum += dfWeight;
    3540             :         }
    3541             : 
    3542      193234 :         if (pabyChunkNodataMask == nullptr)
    3543             :         {
    3544      159721 :             if (dfWeightSum != 0)
    3545             :             {
    3546      159726 :                 const double dfInvWeightSum = 1.0 / dfWeightSum;
    3547      899685 :                 for (int i = 0; i < nSrcLineCount; ++i)
    3548      739959 :                     padfWeights[i] *= dfInvWeightSum;
    3549             :             }
    3550             :         }
    3551             : 
    3552      193234 :         if (pabyChunkNodataMask == nullptr)
    3553             :         {
    3554      159731 :             int iFilteredPixelOff = 0;  // Used after for.
    3555             :             // j used after for.
    3556      159731 :             size_t j =
    3557      159731 :                 (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
    3558             : #ifdef USE_SSE2
    3559             :             if constexpr (eWrkDataType == GDT_Float32)
    3560             :             {
    3561             : #ifdef __AVX__
    3562             :                 for (; iFilteredPixelOff + 15 < nDstXSize;
    3563             :                      iFilteredPixelOff += 16, j += 16)
    3564             :                 {
    3565             :                     GDALResampleConvolutionVertical_16cols(
    3566             :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3567             :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    3568             :                     if (bHasNoData)
    3569             :                     {
    3570             :                         for (int k = 0; k < 16; k++)
    3571             :                         {
    3572             :                             pafDstScanline[iFilteredPixelOff + k] =
    3573             :                                 replaceValIfNodata(
    3574             :                                     pafDstScanline[iFilteredPixelOff + k]);
    3575             :                         }
    3576             :                     }
    3577             :                 }
    3578             : #else
    3579    18739546 :                 for (; iFilteredPixelOff + 7 < nDstXSize;
    3580             :                      iFilteredPixelOff += 8, j += 8)
    3581             :                 {
    3582    18625610 :                     GDALResampleConvolutionVertical_8cols(
    3583    18625610 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3584    18625610 :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    3585    18587070 :                     if (bHasNoData)
    3586             :                     {
    3587       17820 :                         for (int k = 0; k < 8; k++)
    3588             :                         {
    3589       15840 :                             pafDstScanline[iFilteredPixelOff + k] =
    3590       15840 :                                 replaceValIfNodata(
    3591       15840 :                                     pafDstScanline[iFilteredPixelOff + k]);
    3592             :                         }
    3593             :                     }
    3594             :                 }
    3595             : #endif
    3596             : 
    3597      577109 :                 for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
    3598             :                 {
    3599      463230 :                     const Twork fVal =
    3600      463149 :                         static_cast<Twork>(GDALResampleConvolutionVertical(
    3601      463149 :                             padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3602             :                             nSrcLineCount));
    3603      463155 :                     pafDstScanline[iFilteredPixelOff] =
    3604      463230 :                         replaceValIfNodata(fVal);
    3605             :                 }
    3606             :             }
    3607             :             else
    3608             : #endif
    3609             :             {
    3610     2887210 :                 for (; iFilteredPixelOff + 1 < nDstXSize;
    3611             :                      iFilteredPixelOff += 2, j += 2)
    3612             :                 {
    3613     2880000 :                     double dfVal1 = 0.0;
    3614     2880000 :                     double dfVal2 = 0.0;
    3615     2880000 :                     GDALResampleConvolutionVertical_2cols(
    3616     2880000 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3617             :                         nSrcLineCount, dfVal1, dfVal2);
    3618     5760010 :                     pafDstScanline[iFilteredPixelOff] =
    3619     2880000 :                         replaceValIfNodata(static_cast<Twork>(dfVal1));
    3620     2880000 :                     pafDstScanline[iFilteredPixelOff + 1] =
    3621     2880000 :                         replaceValIfNodata(static_cast<Twork>(dfVal2));
    3622             :                 }
    3623        7206 :                 if (iFilteredPixelOff < nDstXSize)
    3624             :                 {
    3625           2 :                     const double dfVal = GDALResampleConvolutionVertical(
    3626           2 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3627             :                         nSrcLineCount);
    3628           2 :                     pafDstScanline[iFilteredPixelOff] =
    3629           2 :                         replaceValIfNodata(static_cast<Twork>(dfVal));
    3630             :                 }
    3631             :             }
    3632             :         }
    3633             :         else
    3634             :         {
    3635    17284632 :             for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
    3636             :                  ++iFilteredPixelOff)
    3637             :             {
    3638    17251205 :                 double dfVal = 0.0;
    3639    17251205 :                 dfWeightSum = 0.0;
    3640    17251205 :                 size_t j = (nSrcLineStart - nChunkYOff) *
    3641    17251205 :                                static_cast<size_t>(nDstXSize) +
    3642    17251205 :                            iFilteredPixelOff;
    3643    17251205 :                 if (bKernelWithNegativeWeights)
    3644             :                 {
    3645    17026301 :                     int nConsecutiveValid = 0;
    3646    17026301 :                     int nMaxConsecutiveValid = 0;
    3647   121244321 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    3648             :                     {
    3649   104218020 :                         const double dfWeight =
    3650   104218020 :                             padfWeights[i] *
    3651             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    3652   104218020 :                         if (pabyChunkNodataMaskHorizontalFiltered[j])
    3653             :                         {
    3654    41787737 :                             nConsecutiveValid++;
    3655             :                         }
    3656    62429783 :                         else if (nConsecutiveValid)
    3657             :                         {
    3658      199248 :                             nMaxConsecutiveValid = std::max(
    3659      199248 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    3660      199248 :                             nConsecutiveValid = 0;
    3661             :                         }
    3662   104218020 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    3663   104218020 :                         dfWeightSum += dfWeight;
    3664             :                     }
    3665    17026301 :                     nMaxConsecutiveValid =
    3666    17026301 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    3667    17026301 :                     if (nMaxConsecutiveValid < nSrcLineCount / 2)
    3668             :                     {
    3669     8839831 :                         pafDstScanline[iFilteredPixelOff] =
    3670     8839739 :                             static_cast<Twork>(dfNoDataValue);
    3671     8839831 :                         continue;
    3672             :                     }
    3673             :                 }
    3674             :                 else
    3675             :                 {
    3676     1130262 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    3677             :                     {
    3678      905432 :                         const double dfWeight =
    3679      905432 :                             padfWeights[i] *
    3680             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    3681      905432 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    3682      905432 :                         dfWeightSum += dfWeight;
    3683             :                     }
    3684             :                 }
    3685     8411324 :                 if (dfWeightSum > 0.0)
    3686             :                 {
    3687     8395283 :                     pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
    3688     8395271 :                         static_cast<Twork>(dfVal / dfWeightSum));
    3689             :                 }
    3690             :                 else
    3691             :                 {
    3692       16039 :                     pafDstScanline[iFilteredPixelOff] =
    3693       16015 :                         static_cast<Twork>(dfNoDataValue);
    3694             :                 }
    3695             :             }
    3696             :         }
    3697             : 
    3698      154669 :         if (fMaxVal != 0.0f)
    3699             :         {
    3700      192324 :             for (int i = 0; i < nDstXSize; ++i)
    3701             :             {
    3702      192088 :                 if (pafDstScanline[i] > fMaxVal)
    3703       96022 :                     pafDstScanline[i] = fMaxVal;
    3704             :             }
    3705             :         }
    3706             : 
    3707      154669 :         if (pafWrkScanline)
    3708             :         {
    3709      184833 :             GDALCopyWords(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
    3710             :                           static_cast<GByte *>(pDstBuffer) +
    3711      184833 :                               static_cast<size_t>(iDstLine - nDstYOff) *
    3712      184833 :                                   nDstXSize * nDstDataTypeSize,
    3713             :                           dstDataType, nDstDataTypeSize, nDstXSize);
    3714             :         }
    3715             :     }
    3716             : 
    3717        3651 :     VSIFree(pafWrkScanline);
    3718        3651 :     VSIFreeAligned(padfWeights);
    3719        3651 :     VSIFree(padfHorizontalFiltered);
    3720        3651 :     VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    3721             : 
    3722        3651 :     return CE_None;
    3723             : }
    3724             : 
    3725             : static CPLErr
    3726        3650 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
    3727             :                               const void *pChunk, void **ppDstBuffer,
    3728             :                               GDALDataType *peDstBufferDataType)
    3729             : {
    3730             :     GDALResampleAlg eResample;
    3731        3650 :     bool bKernelWithNegativeWeights = false;
    3732        3650 :     if (EQUAL(args.pszResampling, "BILINEAR"))
    3733        2579 :         eResample = GRA_Bilinear;
    3734        1071 :     else if (EQUAL(args.pszResampling, "CUBIC"))
    3735             :     {
    3736         991 :         eResample = GRA_Cubic;
    3737         991 :         bKernelWithNegativeWeights = true;
    3738             :     }
    3739          80 :     else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
    3740          23 :         eResample = GRA_CubicSpline;
    3741          57 :     else if (EQUAL(args.pszResampling, "LANCZOS"))
    3742             :     {
    3743          54 :         eResample = GRA_Lanczos;
    3744          54 :         bKernelWithNegativeWeights = true;
    3745             :     }
    3746             :     else
    3747             :     {
    3748           3 :         CPLAssert(false);
    3749             :         return CE_Failure;
    3750             :     }
    3751        3647 :     const int nKernelRadius = GWKGetFilterRadius(eResample);
    3752        3644 :     FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
    3753             :     const FilterFunc4ValuesType pfnFilterFunc4Values =
    3754        3646 :         GWKGetFilterFunc4Values(eResample);
    3755             : 
    3756        3644 :     float fMaxVal = 0.f;
    3757             :     // Cubic, etc... can have overshoots, so make sure we clamp values to the
    3758             :     // maximum value if NBITS is set.
    3759        3644 :     if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
    3760           8 :         (args.eOvrDataType == GDT_Byte || args.eOvrDataType == GDT_UInt16 ||
    3761           0 :          args.eOvrDataType == GDT_UInt32))
    3762             :     {
    3763           8 :         int nBits = args.nOvrNBITS;
    3764           8 :         if (nBits == GDALGetDataTypeSize(args.eOvrDataType))
    3765           1 :             nBits = 0;
    3766           8 :         if (nBits > 0 && nBits < 32)
    3767           7 :             fMaxVal = static_cast<float>((1U << nBits) - 1);
    3768             :     }
    3769             : 
    3770        3644 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(
    3771             :         args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
    3772             :         GDALGetDataTypeSizeBytes(args.eOvrDataType));
    3773        3648 :     if (*ppDstBuffer == nullptr)
    3774             :     {
    3775           0 :         return CE_Failure;
    3776             :     }
    3777        3648 :     *peDstBufferDataType = args.eOvrDataType;
    3778             : 
    3779        3648 :     switch (args.eWrkDataType)
    3780             :     {
    3781        2923 :         case GDT_Byte:
    3782             :         {
    3783        2923 :             return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
    3784             :                 args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
    3785             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3786        2925 :                 bKernelWithNegativeWeights, fMaxVal);
    3787             :         }
    3788             : 
    3789         395 :         case GDT_UInt16:
    3790             :         {
    3791         395 :             return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
    3792             :                 args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
    3793             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3794         396 :                 bKernelWithNegativeWeights, fMaxVal);
    3795             :         }
    3796             : 
    3797         301 :         case GDT_Float32:
    3798             :         {
    3799         301 :             return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
    3800             :                 args, static_cast<const float *>(pChunk), *ppDstBuffer,
    3801             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3802         301 :                 bKernelWithNegativeWeights, fMaxVal);
    3803             :         }
    3804             : 
    3805          29 :         case GDT_Float64:
    3806             :         {
    3807          29 :             return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
    3808             :                 args, static_cast<const double *>(pChunk), *ppDstBuffer,
    3809             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3810          29 :                 bKernelWithNegativeWeights, fMaxVal);
    3811             :         }
    3812             : 
    3813           0 :         default:
    3814           0 :             break;
    3815             :     }
    3816             : 
    3817           0 :     CPLAssert(false);
    3818             :     return CE_Failure;
    3819             : }
    3820             : 
    3821             : /************************************************************************/
    3822             : /*                       GDALResampleChunkC32R()                        */
    3823             : /************************************************************************/
    3824             : 
    3825           2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
    3826             :                                     const float *pafChunk, const int nChunkYOff,
    3827             :                                     const int nChunkYSize, const int nDstYOff,
    3828             :                                     const int nDstYOff2, const int nOvrXSize,
    3829             :                                     const int nOvrYSize, void **ppDstBuffer,
    3830             :                                     GDALDataType *peDstBufferDataType,
    3831             :                                     const char *pszResampling)
    3832             : 
    3833             : {
    3834             :     enum Method
    3835             :     {
    3836             :         NEAR,
    3837             :         AVERAGE,
    3838             :         AVERAGE_MAGPHASE,
    3839             :         RMS,
    3840             :     };
    3841             : 
    3842           2 :     Method eMethod = NEAR;
    3843           2 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))
    3844             :     {
    3845           0 :         eMethod = NEAR;
    3846             :     }
    3847           2 :     else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
    3848             :     {
    3849           0 :         eMethod = AVERAGE_MAGPHASE;
    3850             :     }
    3851           2 :     else if (EQUAL(pszResampling, "RMS"))
    3852             :     {
    3853           2 :         eMethod = RMS;
    3854             :     }
    3855           0 :     else if (STARTS_WITH_CI(pszResampling, "AVER"))
    3856             :     {
    3857           0 :         eMethod = AVERAGE;
    3858             :     }
    3859             :     else
    3860             :     {
    3861           0 :         CPLError(
    3862             :             CE_Failure, CPLE_NotSupported,
    3863             :             "Resampling method %s is not supported for complex data types. "
    3864             :             "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
    3865             :             pszResampling);
    3866           0 :         return CE_Failure;
    3867             :     }
    3868             : 
    3869           2 :     const int nOXSize = nOvrXSize;
    3870           2 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
    3871             :                                        GDALGetDataTypeSizeBytes(GDT_CFloat32));
    3872           2 :     if (*ppDstBuffer == nullptr)
    3873             :     {
    3874           0 :         return CE_Failure;
    3875             :     }
    3876           2 :     float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
    3877           2 :     *peDstBufferDataType = GDT_CFloat32;
    3878             : 
    3879           2 :     const int nOYSize = nOvrYSize;
    3880           2 :     const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
    3881           2 :     const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
    3882             : 
    3883             :     /* ==================================================================== */
    3884             :     /*      Loop over destination scanlines.                                */
    3885             :     /* ==================================================================== */
    3886           8 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    3887             :     {
    3888           6 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    3889           6 :         if (nSrcYOff < nChunkYOff)
    3890           0 :             nSrcYOff = nChunkYOff;
    3891             : 
    3892           6 :         int nSrcYOff2 =
    3893           6 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
    3894           6 :         if (nSrcYOff2 == nSrcYOff)
    3895           0 :             nSrcYOff2++;
    3896             : 
    3897           6 :         if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
    3898             :         {
    3899           2 :             if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
    3900           0 :                 nSrcYOff = nSrcHeight - 1;
    3901           2 :             nSrcYOff2 = nSrcHeight;
    3902             :         }
    3903           6 :         if (nSrcYOff2 > nChunkYOff + nChunkYSize)
    3904           0 :             nSrcYOff2 = nChunkYOff + nChunkYSize;
    3905             : 
    3906           6 :         const float *const pafSrcScanline =
    3907           6 :             pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
    3908           6 :         float *const pafDstScanline =
    3909           6 :             pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
    3910             : 
    3911             :         /* --------------------------------------------------------------------
    3912             :          */
    3913             :         /*      Loop over destination pixels */
    3914             :         /* --------------------------------------------------------------------
    3915             :          */
    3916          18 :         for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
    3917             :         {
    3918          12 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    3919          12 :             int nSrcXOff2 =
    3920          12 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
    3921          12 :             if (nSrcXOff2 == nSrcXOff)
    3922           0 :                 nSrcXOff2++;
    3923          12 :             if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
    3924             :             {
    3925           6 :                 if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
    3926           0 :                     nSrcXOff = nSrcWidth - 1;
    3927           6 :                 nSrcXOff2 = nSrcWidth;
    3928             :             }
    3929             : 
    3930          12 :             if (eMethod == NEAR)
    3931             :             {
    3932           0 :                 pafDstScanline[iDstPixel * 2] = pafSrcScanline[nSrcXOff * 2];
    3933           0 :                 pafDstScanline[iDstPixel * 2 + 1] =
    3934           0 :                     pafSrcScanline[nSrcXOff * 2 + 1];
    3935             :             }
    3936          12 :             else if (eMethod == AVERAGE_MAGPHASE)
    3937             :             {
    3938           0 :                 double dfTotalR = 0.0;
    3939           0 :                 double dfTotalI = 0.0;
    3940           0 :                 double dfTotalM = 0.0;
    3941           0 :                 int nCount = 0;
    3942             : 
    3943           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    3944             :                 {
    3945           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    3946             :                     {
    3947           0 :                         const double dfR =
    3948           0 :                             pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
    3949           0 :                                                         iY - nSrcYOff) *
    3950           0 :                                                         nSrcWidth * 2];
    3951           0 :                         const double dfI =
    3952           0 :                             pafSrcScanline[iX * 2 +
    3953           0 :                                            static_cast<GPtrDiff_t>(iY -
    3954           0 :                                                                    nSrcYOff) *
    3955           0 :                                                nSrcWidth * 2 +
    3956           0 :                                            1];
    3957           0 :                         dfTotalR += dfR;
    3958           0 :                         dfTotalI += dfI;
    3959           0 :                         dfTotalM += std::hypot(dfR, dfI);
    3960           0 :                         ++nCount;
    3961             :                     }
    3962             :                 }
    3963             : 
    3964           0 :                 CPLAssert(nCount > 0);
    3965           0 :                 if (nCount == 0)
    3966             :                 {
    3967           0 :                     pafDstScanline[iDstPixel * 2] = 0.0;
    3968           0 :                     pafDstScanline[iDstPixel * 2 + 1] = 0.0;
    3969             :                 }
    3970             :                 else
    3971             :                 {
    3972           0 :                     pafDstScanline[iDstPixel * 2] =
    3973           0 :                         static_cast<float>(dfTotalR / nCount);
    3974           0 :                     pafDstScanline[iDstPixel * 2 + 1] =
    3975           0 :                         static_cast<float>(dfTotalI / nCount);
    3976             :                     const double dfM =
    3977           0 :                         std::hypot(pafDstScanline[iDstPixel * 2],
    3978           0 :                                    pafDstScanline[iDstPixel * 2 + 1]);
    3979           0 :                     const double dfDesiredM = dfTotalM / nCount;
    3980           0 :                     double dfRatio = 1.0;
    3981           0 :                     if (dfM != 0.0)
    3982           0 :                         dfRatio = dfDesiredM / dfM;
    3983             : 
    3984           0 :                     pafDstScanline[iDstPixel * 2] *=
    3985           0 :                         static_cast<float>(dfRatio);
    3986           0 :                     pafDstScanline[iDstPixel * 2 + 1] *=
    3987           0 :                         static_cast<float>(dfRatio);
    3988             :                 }
    3989             :             }
    3990          12 :             else if (eMethod == RMS)
    3991             :             {
    3992          12 :                 double dfTotalR = 0.0;
    3993          12 :                 double dfTotalI = 0.0;
    3994          12 :                 int nCount = 0;
    3995             : 
    3996          36 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    3997             :                 {
    3998          72 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    3999             :                     {
    4000          48 :                         const double dfR =
    4001          48 :                             pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
    4002          48 :                                                         iY - nSrcYOff) *
    4003          48 :                                                         nSrcWidth * 2];
    4004          48 :                         const double dfI =
    4005          48 :                             pafSrcScanline[iX * 2 +
    4006          48 :                                            static_cast<GPtrDiff_t>(iY -
    4007          48 :                                                                    nSrcYOff) *
    4008          48 :                                                nSrcWidth * 2 +
    4009          48 :                                            1];
    4010             : 
    4011          48 :                         dfTotalR += SQUARE(dfR);
    4012          48 :                         dfTotalI += SQUARE(dfI);
    4013             : 
    4014          48 :                         ++nCount;
    4015             :                     }
    4016             :                 }
    4017             : 
    4018          12 :                 CPLAssert(nCount > 0);
    4019          12 :                 if (nCount == 0)
    4020             :                 {
    4021           0 :                     pafDstScanline[iDstPixel * 2] = 0.0;
    4022           0 :                     pafDstScanline[iDstPixel * 2 + 1] = 0.0;
    4023             :                 }
    4024             :                 else
    4025             :                 {
    4026             :                     /* compute RMS */
    4027          12 :                     pafDstScanline[iDstPixel * 2] =
    4028          12 :                         static_cast<float>(sqrt(dfTotalR / nCount));
    4029          12 :                     pafDstScanline[iDstPixel * 2 + 1] =
    4030          12 :                         static_cast<float>(sqrt(dfTotalI / nCount));
    4031             :                 }
    4032             :             }
    4033           0 :             else if (eMethod == AVERAGE)
    4034             :             {
    4035           0 :                 double dfTotalR = 0.0;
    4036           0 :                 double dfTotalI = 0.0;
    4037           0 :                 int nCount = 0;
    4038             : 
    4039           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4040             :                 {
    4041           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4042             :                     {
    4043             :                         // TODO(schwehr): Maybe use std::complex?
    4044           0 :                         dfTotalR +=
    4045           0 :                             pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
    4046           0 :                                                         iY - nSrcYOff) *
    4047           0 :                                                         nSrcWidth * 2];
    4048           0 :                         dfTotalI += pafSrcScanline[iX * 2 +
    4049           0 :                                                    static_cast<GPtrDiff_t>(
    4050           0 :                                                        iY - nSrcYOff) *
    4051           0 :                                                        nSrcWidth * 2 +
    4052           0 :                                                    1];
    4053           0 :                         ++nCount;
    4054             :                     }
    4055             :                 }
    4056             : 
    4057           0 :                 CPLAssert(nCount > 0);
    4058           0 :                 if (nCount == 0)
    4059             :                 {
    4060           0 :                     pafDstScanline[iDstPixel * 2] = 0.0;
    4061           0 :                     pafDstScanline[iDstPixel * 2 + 1] = 0.0;
    4062             :                 }
    4063             :                 else
    4064             :                 {
    4065           0 :                     pafDstScanline[iDstPixel * 2] =
    4066           0 :                         static_cast<float>(dfTotalR / nCount);
    4067           0 :                     pafDstScanline[iDstPixel * 2 + 1] =
    4068           0 :                         static_cast<float>(dfTotalI / nCount);
    4069             :                 }
    4070             :             }
    4071             :         }
    4072             :     }
    4073             : 
    4074           2 :     return CE_None;
    4075             : }
    4076             : 
    4077             : /************************************************************************/
    4078             : /*                  GDALRegenerateCascadingOverviews()                  */
    4079             : /*                                                                      */
    4080             : /*      Generate a list of overviews in order from largest to           */
    4081             : /*      smallest, computing each from the next larger.                  */
    4082             : /************************************************************************/
    4083             : // Reorders papoOvrBands in place; returns the first non-CE_None result, else CE_None.
    4084          42 : static CPLErr GDALRegenerateCascadingOverviews(
    4085             :     GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
    4086             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    4087             :     void *pProgressData, CSLConstList papszOptions)
    4088             : 
    4089             : {
    4090             :     /* -------------------------------------------------------------------- */
    4091             :     /*      First, we must put the overviews in order from largest to       */
    4092             :     /*      smallest.                                                       */
    4093             :     /* -------------------------------------------------------------------- */
    4094         120 :     for (int i = 0; i < nOverviews - 1; ++i)  // bubble sort, larger X*Y (evaluated in float) first
    4095             :     {
    4096         270 :         for (int j = 0; j < nOverviews - i - 1; ++j)
    4097             :         {
    4098         192 :             if (papoOvrBands[j]->GetXSize() *
    4099         192 :                     static_cast<float>(papoOvrBands[j]->GetYSize()) <
    4100         192 :                 papoOvrBands[j + 1]->GetXSize() *
    4101         192 :                     static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
    4102             :             {
    4103           0 :                 GDALRasterBand *poTempBand = papoOvrBands[j];
    4104           0 :                 papoOvrBands[j] = papoOvrBands[j + 1];
    4105           0 :                 papoOvrBands[j + 1] = poTempBand;
    4106             :             }
    4107             :         }
    4108             :     }
    4109             : 
    4110             :     /* -------------------------------------------------------------------- */
    4111             :     /*      Count total pixels so we can prepare appropriate scaled         */
    4112             :     /*      progress functions.                                             */
    4113             :     /* -------------------------------------------------------------------- */
    4114          42 :     double dfTotalPixels = 0.0;
    4115             : 
    4116         162 :     for (int i = 0; i < nOverviews; ++i)
    4117             :     {
    4118         120 :         dfTotalPixels += papoOvrBands[i]->GetXSize() *
    4119         120 :                          static_cast<double>(papoOvrBands[i]->GetYSize());
    4120             :     }
    4121             : // NOTE(review): dfTotalPixels is used as a divisor below and is not checked for 0.
    4122             :     /* -------------------------------------------------------------------- */
    4123             :     /*      Generate all the bands.                                         */
    4124             :     /* -------------------------------------------------------------------- */
    4125          42 :     double dfPixelsProcessed = 0.0;
    4126             :     // Overview 0 is built from poSrcBand, overview i from overview i - 1.
    4127         162 :     for (int i = 0; i < nOverviews; ++i)
    4128             :     {
    4129         120 :         GDALRasterBand *poBaseBand = poSrcBand;
    4130         120 :         if (i != 0)
    4131          78 :             poBaseBand = papoOvrBands[i - 1];
    4132             :         // Pixel count of this overview; it sets this level's share of the progress range.
    4133         120 :         double dfPixels = papoOvrBands[i]->GetXSize() *
    4134         120 :                           static_cast<double>(papoOvrBands[i]->GetYSize());
    4135             : 
    4136         240 :         void *pScaledProgressData = GDALCreateScaledProgress(
    4137             :             dfPixelsProcessed / dfTotalPixels,
    4138         120 :             (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
    4139             :             pProgressData);
    4140             :         // One-element call: only papoOvrBands[i] is regenerated here.
    4141         240 :         const CPLErr eErr = GDALRegenerateOverviewsEx(
    4142             :             poBaseBand, 1,
    4143         120 :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
    4144             :             pszResampling, GDALScaledProgress, pScaledProgressData,
    4145             :             papszOptions);
    4146         120 :         GDALDestroyScaledProgress(pScaledProgressData);
    4147             : 
    4148         120 :         if (eErr != CE_None)
    4149           0 :             return eErr;
    4150             : 
    4151         120 :         dfPixelsProcessed += dfPixels;
    4152             : 
    4153             :         // Only do the bit2grayscale promotion on the base band.
    4154         120 :         if (STARTS_WITH_CI(pszResampling,
    4155             :                            "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
    4156           8 :             pszResampling = "AVERAGE";  // every following level uses plain AVERAGE
    4157             :     }
    4158             : 
    4159          42 :     return CE_None;
    4160             : }
    4161             : 
    4162             : /************************************************************************/
    4163             : /*                    GDALGetResampleFunction()                         */
    4164             : /************************************************************************/
    4165             : // Returns the chunk resampler for pszResampling, or nullptr (after CPLError) if the name is unknown.
    4166        3839 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
    4167             :                                              int *pnRadius)
    4168             : {
    4169        3839 :     if (pnRadius)
    4170        3839 :         *pnRadius = 0;  // default; overwritten below for GAUSS and the convolution kernels
    4171        3839 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))  // prefix test, so e.g. NEAREST is accepted
    4172         425 :         return GDALResampleChunk_Near;
    4173        3414 :     else if (STARTS_WITH_CI(pszResampling, "AVER") ||
    4174        2889 :              EQUAL(pszResampling, "RMS"))
    4175         553 :         return GDALResampleChunk_AverageOrRMS;
    4176        2861 :     else if (EQUAL(pszResampling, "GAUSS"))
    4177             :     {
    4178          26 :         if (pnRadius)
    4179          26 :             *pnRadius = 1;
    4180          26 :         return GDALResampleChunk_Gauss;
    4181             :     }
    4182        2835 :     else if (EQUAL(pszResampling, "MODE"))
    4183          96 :         return GDALResampleChunk_Mode;
    4184        2739 :     else if (EQUAL(pszResampling, "CUBIC"))  // the four kernels below share GDALResampleChunk_Convolution
    4185             :     {
    4186         377 :         if (pnRadius)
    4187         375 :             *pnRadius = GWKGetFilterRadius(GRA_Cubic);
    4188         375 :         return GDALResampleChunk_Convolution;
    4189             :     }
    4190        2362 :     else if (EQUAL(pszResampling, "CUBICSPLINE"))
    4191             :     {
    4192           3 :         if (pnRadius)
    4193           3 :             *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
    4194           3 :         return GDALResampleChunk_Convolution;
    4195             :     }
    4196        2359 :     else if (EQUAL(pszResampling, "LANCZOS"))
    4197             :     {
    4198           8 :         if (pnRadius)
    4199           8 :             *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
    4200           8 :         return GDALResampleChunk_Convolution;
    4201             :     }
    4202        2351 :     else if (EQUAL(pszResampling, "BILINEAR"))
    4203             :     {
    4204        2357 :         if (pnRadius)
    4205        2357 :             *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
    4206        2357 :         return GDALResampleChunk_Convolution;
    4207             :     }
    4208             :     else  // unrecognised name: report the error and return nullptr
    4209             :     {
    4210           0 :         CPLError(
    4211             :             CE_Failure, CPLE_AppDefined,
    4212             :             "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
    4213             :             pszResampling);
    4214           0 :         return nullptr;
    4215             :     }
    4216             : }
    4217             : 
    4218             : /************************************************************************/
    4219             : /*                      GDALGetOvrWorkDataType()                        */
    4220             : /************************************************************************/
    4221             : // Selects the work data type used when resampling a band of type eSrcDataType with pszResampling.
    4222        3729 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
    4223             :                                     GDALDataType eSrcDataType)
    4224             : {
    4225        3729 :     if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
    4226             :     {
    4227         511 :         return eSrcDataType;  // NEAR* and MODE keep the source type
    4228             :     }
    4229        3218 :     else if (eSrcDataType == GDT_Byte &&
    4230        2907 :              (STARTS_WITH_CI(pszResampling, "AVER") ||
    4231        2450 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4232        2247 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4233        2244 :               EQUAL(pszResampling, "LANCZOS") ||
    4234        2239 :               EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))  // MODE never matches here: it returned above
    4235             :     {
    4236        2900 :         return GDT_Byte;
    4237             :     }
    4238         318 :     else if (eSrcDataType == GDT_UInt16 &&
    4239         119 :              (STARTS_WITH_CI(pszResampling, "AVER") ||
    4240         108 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4241           3 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4242           3 :               EQUAL(pszResampling, "LANCZOS") ||
    4243           2 :               EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))  // MODE never matches here: it returned above
    4244             :     {
    4245         102 :         return GDT_UInt16;
    4246             :     }
    4247         216 :     else if (EQUAL(pszResampling, "GAUSS"))
    4248          20 :         return GDT_Float64;
    4249             :     // Fallback: Byte/Int8/UInt16/Int16/Float32 sources work in Float32, everything else in Float64.
    4250         196 :     if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||
    4251         184 :         eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
    4252             :         eSrcDataType == GDT_Float32)
    4253             :     {
    4254         160 :         return GDT_Float32;
    4255             :     }
    4256          36 :     return GDT_Float64;
    4257             : }
    4258             : 
    4259             : namespace
    4260             : {
    4261             : // Structure to hold a pointer to free with CPLFree()
    4262             : struct PointerHolder
    4263             : {
    4264             :     void *ptr = nullptr;
    4265             :     // Stores ptrIn; the destructor below hands it to CPLFree().
    4266       34637 :     explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
    4267             :     {
    4268       34637 :     }
    4269             :     // Frees the held pointer with CPLFree().
    4270       34646 :     ~PointerHolder()
    4271       34646 :     {
    4272       34646 :         CPLFree(ptr);
    4273       34646 :     }
    4274             :     // Copying is disabled: a copy would hand the same ptr to CPLFree() a second time.
    4275             :     PointerHolder(const PointerHolder &) = delete;
    4276             :     PointerHolder &operator=(const PointerHolder &) = delete;
    4277             : };
    4278             : }  // namespace
    4279             : 
    4280             : /************************************************************************/
    4281             : /*                      GDALRegenerateOverviews()                       */
    4282             : /************************************************************************/
    4283             : 
    4284             : /**
    4285             :  * \brief Generate downsampled overviews.
    4286             :  *
    4287             :  * This function will generate one or more overview images from a base image
    4288             :  * using the requested downsampling algorithm.  Its primary use is for
    4289             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4290             :  * used to generate downsampled images in one file from another outside the
    4291             :  * overview architecture.
    4292             :  *
    4293             :  * The output bands need to exist in advance.
    4294             :  *
    4295             :  * The full set of resampling algorithms is documented in
    4296             :  * GDALDataset::BuildOverviews().
    4297             :  *
    4298             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    4299             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    4300             :  * considered as the nodata value and not each value of the triplet
    4301             :  * independently per band.
    4302             :  *
    4303             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4304             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    4305             :  * overview computation.
    4306             :  *
    4307             :  * @param hSrcBand the source (base level) band.
    4308             :  * @param nOverviewCount the number of downsampled bands being generated.
    4309             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4310             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4311             :  * @param pfnProgress progress report function.
    4312             :  * @param pProgressData progress function callback data.
    4313             :  * @return CE_None on success or CE_Failure on failure.
    4314             :  */
    4315         252 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
    4316             :                                GDALRasterBandH *pahOvrBands,
    4317             :                                const char *pszResampling,
    4318             :                                GDALProgressFunc pfnProgress,
    4319             :                                void *pProgressData)
    4320             : 
    4321             : {
    4322         252 :     return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
    4323             :                                      pszResampling, pfnProgress, pProgressData,
    4324         252 :                                      nullptr);  // papszOptions: none, everything else is forwarded as-is
    4325             : }
    4326             : 
    4327             : /************************************************************************/
    4328             : /*                     GDALRegenerateOverviewsEx()                      */
    4329             : /************************************************************************/
    4330             : 
    4331             : /**
    4332             :  * \brief Generate downsampled overviews.
    4333             :  *
    4334             :  * This function will generate one or more overview images from a base image
    4335             :  * using the requested downsampling algorithm.  Its primary use is for
    4336             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4337             :  * used to generate downsampled images in one file from another outside the
    4338             :  * overview architecture.
    4339             :  *
    4340             :  * The output bands need to exist in advance.
    4341             :  *
    4342             :  * The full set of resampling algorithms is documented in
    4343             :  * GDALDataset::BuildOverviews().
    4344             :  *
    4345             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    4346             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    4347             :  * considered as the nodata value and not each value of the triplet
    4348             :  * independently per band.
    4349             :  *
    4350             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4351             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    4352             :  * overview computation.
    4353             :  *
    4354             :  * @param hSrcBand the source (base level) band.
    4355             :  * @param nOverviewCount the number of downsampled bands being generated.
    4356             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4357             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4358             :  * @param pfnProgress progress report function.
    4359             :  * @param pProgressData progress function callback data.
    4360             :  * @param papszOptions NULL terminated list of options as key=value pairs, or
    4361             :  * NULL
    4362             :  * @return CE_None on success or CE_Failure on failure.
    4363             :  * @since GDAL 3.6
    4364             :  */
    4365         806 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
    4366             :                                  GDALRasterBandH *pahOvrBands,
    4367             :                                  const char *pszResampling,
    4368             :                                  GDALProgressFunc pfnProgress,
    4369             :                                  void *pProgressData, CSLConstList papszOptions)
    4370             : 
    4371             : {
    4372         806 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    4373         806 :     GDALRasterBand **papoOvrBands =
    4374             :         reinterpret_cast<GDALRasterBand **>(pahOvrBands);
    4375             : 
    4376         806 :     if (pfnProgress == nullptr)
    4377         252 :         pfnProgress = GDALDummyProgress;
    4378             : 
    4379         806 :     if (EQUAL(pszResampling, "NONE"))
    4380          61 :         return CE_None;
    4381             : 
    4382         745 :     int nKernelRadius = 0;
    4383             :     GDALResampleFunction pfnResampleFn =
    4384         745 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    4385             : 
    4386         745 :     if (pfnResampleFn == nullptr)
    4387           0 :         return CE_Failure;
    4388             : 
    4389             :     /* -------------------------------------------------------------------- */
    4390             :     /*      Check color tables...                                           */
    4391             :     /* -------------------------------------------------------------------- */
    4392         745 :     GDALColorTable *poColorTable = nullptr;
    4393             : 
    4394         384 :     if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
    4395        1564 :          EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
    4396         446 :         poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    4397             :     {
    4398           9 :         poColorTable = poSrcBand->GetColorTable();
    4399           9 :         if (poColorTable != nullptr)
    4400             :         {
    4401           9 :             if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
    4402             :             {
    4403           0 :                 CPLError(CE_Warning, CPLE_AppDefined,
    4404             :                          "Computing overviews on palette index raster bands "
    4405             :                          "with a palette whose color interpretation is not RGB "
    4406             :                          "will probably lead to unexpected results.");
    4407           0 :                 poColorTable = nullptr;
    4408             :             }
    4409           9 :             else if (poColorTable->IsIdentity())
    4410             :             {
    4411           0 :                 poColorTable = nullptr;
    4412             :             }
    4413             :         }
    4414             :         else
    4415             :         {
    4416           0 :             CPLError(CE_Warning, CPLE_AppDefined,
    4417             :                      "Computing overviews on palette index raster bands "
    4418             :                      "without a palette will probably lead to unexpected "
    4419             :                      "results.");
    4420             :         }
    4421             :     }
    4422             :     // Not ready yet
    4423        2154 :     else if ((EQUAL(pszResampling, "CUBIC") ||
    4424         682 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4425         682 :               EQUAL(pszResampling, "LANCZOS") ||
    4426        1475 :               EQUAL(pszResampling, "BILINEAR")) &&
    4427          57 :              poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    4428             :     {
    4429           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    4430             :                  "Computing %s overviews on palette index raster bands "
    4431             :                  "will probably lead to unexpected results.",
    4432             :                  pszResampling);
    4433             :     }
    4434             : 
    4435             :     // If we have a nodata mask and we are doing something more complicated
    4436             :     // than nearest neighbouring, we have to fetch the nodata mask.
    4437             : 
    4438         745 :     GDALRasterBand *poMaskBand = nullptr;
    4439         745 :     bool bUseNoDataMask = false;
    4440         745 :     bool bCanUseCascaded = true;
    4441             : 
    4442         745 :     if (!STARTS_WITH_CI(pszResampling, "NEAR"))
    4443             :     {
    4444             :         // Special case if we are an alpha/mask band. We want it to be
    4445             :         // considered as the mask band to avoid alpha=0 to be taken into account
    4446             :         // in average computation.
    4447         503 :         if (poSrcBand->IsMaskBand())
    4448             :         {
    4449          90 :             poMaskBand = poSrcBand;
    4450          90 :             bUseNoDataMask = true;
    4451             :         }
    4452             :         else
    4453             :         {
    4454         413 :             poMaskBand = poSrcBand->GetMaskBand();
    4455         413 :             const int nMaskFlags = poSrcBand->GetMaskFlags();
    4456         413 :             bCanUseCascaded =
    4457         413 :                 (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
    4458         413 :             bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
    4459             :         }
    4460             :     }
    4461             : 
    4462             :     /* -------------------------------------------------------------------- */
    4463             :     /*      If we are operating on multiple overviews, and using            */
    4464             :     /*      averaging, lets do them in cascading order to reduce the        */
    4465             :     /*      amount of computation.                                          */
    4466             :     /* -------------------------------------------------------------------- */
    4467             : 
    4468             :     // In case the mask may be computed from another band of the dataset,
    4469             :     // we can't use cascaded generation, as the computation of the overviews
    4470             :     // of the band used for the mask band may not have yet occurred (#3033).
    4471         745 :     if ((STARTS_WITH_CI(pszResampling, "AVER") ||
    4472         384 :          EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
    4473         353 :          EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
    4474         299 :          EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
    4475         745 :          EQUAL(pszResampling, "MODE")) &&
    4476          42 :         nOverviewCount > 1 && bCanUseCascaded)
    4477          42 :         return GDALRegenerateCascadingOverviews(
    4478             :             poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
    4479          42 :             pProgressData, papszOptions);
    4480             : 
    4481             :     /* -------------------------------------------------------------------- */
    4482             :     /*      Setup one horizontal swath to read from the raw buffer.         */
    4483             :     /* -------------------------------------------------------------------- */
    4484         703 :     int nFRXBlockSize = 0;
    4485         703 :     int nFRYBlockSize = 0;
    4486         703 :     poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
    4487             : 
    4488         703 :     const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
    4489        1164 :     const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
    4490        1118 :                                        EQUAL(pszResampling, "MODE") ||
    4491         415 :                                        !GDALDataTypeIsComplex(eSrcDataType);
    4492             :     const GDALDataType eWrkDataType =
    4493             :         bUseGenericResampleFn
    4494         703 :             ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
    4495         703 :             : GDT_CFloat32;
    4496             : 
    4497         703 :     const int nWidth = poSrcBand->GetXSize();
    4498         703 :     const int nHeight = poSrcBand->GetYSize();
    4499             : 
    4500         703 :     int nMaxOvrFactor = 1;
    4501        1487 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    4502             :     {
    4503         784 :         const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
    4504         784 :         const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
    4505         784 :         nMaxOvrFactor = std::max(
    4506             :             nMaxOvrFactor,
    4507         784 :             static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
    4508         784 :         nMaxOvrFactor = std::max(
    4509             :             nMaxOvrFactor,
    4510         784 :             static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
    4511             :     }
    4512             : 
    4513         703 :     int nFullResYChunk = nFRYBlockSize;
    4514         703 :     int nMaxChunkYSizeQueried = 0;
    4515             : 
    4516             :     const auto UpdateChunkHeightAndGetChunkSize =
    4517        9137 :         [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
    4518       27411 :          eWrkDataType, nWidth]()
    4519             :     {
    4520             :         // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
    4521             :         // + nFullResYChunk) / nMaxOvrFactor)
    4522        9137 :         nFullResYChunk = std::max(nFullResYChunk, 2 * nMaxOvrFactor);
    4523        9137 :         nMaxChunkYSizeQueried =
    4524        9137 :             nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
    4525        9137 :         return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
    4526        9137 :                nMaxChunkYSizeQueried * nWidth;
    4527         703 :     };
    4528             : 
    4529             :     // Only configurable for debug / testing
    4530             :     const char *pszChunkYSize =
    4531         703 :         CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
    4532         703 :     if (pszChunkYSize)
    4533             :     {
    4534             :         // coverity[tainted_data]
    4535           0 :         nFullResYChunk = atoi(pszChunkYSize);
    4536             :     }
    4537             : 
    4538             :     // Only configurable for debug / testing
    4539             :     const int nChunkMaxSize =
    4540         703 :         atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
    4541             : 
    4542         703 :     auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
    4543         703 :     if (nChunkSize > nChunkMaxSize)
    4544             :     {
    4545           3 :         if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
    4546           9 :             !GDALDataTypeIsComplex(eSrcDataType) &&
    4547           3 :             (!STARTS_WITH_CI(pszResampling, "AVER") ||
    4548           0 :              EQUAL(pszResampling, "AVERAGE")))
    4549             :         {
    4550             :             // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
    4551             :             // which uses a block-based strategy, which is much less memory
    4552             :             // hungry.
    4553           3 :             return GDALRegenerateOverviewsMultiBand(
    4554             :                 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
    4555           3 :                 pfnProgress, pProgressData, papszOptions);
    4556             :         }
    4557           0 :         else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
    4558             :         {
    4559           0 :             return GDALRegenerateCascadingOverviews(
    4560             :                 poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
    4561           0 :                 pfnProgress, pProgressData, papszOptions);
    4562             :         }
    4563             :     }
    4564         700 :     else if (pszChunkYSize == nullptr)
    4565             :     {
    4566             :         // Try to get as close as possible to nChunkMaxSize
    4567        9134 :         while (nChunkSize * 2 < nChunkMaxSize)
    4568             :         {
    4569        8434 :             nFullResYChunk *= 2;
    4570        8434 :             nChunkSize = UpdateChunkHeightAndGetChunkSize();
    4571             :         }
    4572             :     }
    4573             : 
    4574         700 :     int nHasNoData = 0;
    4575         700 :     const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
    4576         700 :     const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    4577             :     const bool bPropagateNoData =
    4578         700 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    4579             : 
    4580             :     // Structure describing a resampling job: one source chunk resampled into one overview band
    4581             :     struct OvrJob
    4582             :     {
    4583             :         // Buffers to free when job is finished. In threaded mode the two source holders are shared by all jobs created for the same source chunk.
    4584             :         std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
    4585             :         std::shared_ptr<PointerHolder> oSrcBufferHolder{};
    4586             :         std::unique_ptr<PointerHolder> oDstBufferHolder{};  // created by JobResampleFunc around pDstBuffer
    4587             :         // Overview band that WriteJobData() writes pDstBuffer to
    4588             :         GDALRasterBand *poDstBand = nullptr;
    4589             : 
    4590             :         // Input parameters of the resampling function
    4591             :         GDALResampleFunction pfnResampleFn = nullptr;  // called when bUseGenericResampleFn is true
    4592             :         int nSrcWidth = 0;   // only passed to GDALResampleChunkC32R()
    4593             :         int nSrcHeight = 0;  // only passed to GDALResampleChunkC32R()
    4594             :         int nDstWidth = 0;   // width of the window written by WriteJobData()
    4595             :         GDALOverviewResampleArgs args{};
    4596             :         const void *pChunk = nullptr;  // source chunk buffer handed to the resampling function
    4597             :         bool bUseGenericResampleFn = false;  // true: pfnResampleFn, false: GDALResampleChunkC32R()
    4598             : 
    4599             :         // Output values of resampling function
    4600             :         CPLErr eErr = CE_Failure;  // remains CE_Failure until the resampling function has returned
    4601             :         void *pDstBuffer = nullptr;
    4602             :         GDALDataType eDstBufferDataType = GDT_Unknown;
    4603             : 
    4604             :         // Synchronization: bFinished is set to true under mutex, then cv is notified, once resampling is done
    4605             :         bool bFinished = false;
    4606             :         std::mutex mutex{};
    4607             :         std::condition_variable cv{};
    4608             :         // Store a copy of the shared pointer holding the source mask buffer
    4609           0 :         void SetSrcMaskBufferHolder(
    4610             :             const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
    4611             :         {
    4612           0 :             oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
    4613           0 :         }
    4614             :         // Store a copy of the shared pointer holding the source chunk buffer
    4615           0 :         void SetSrcBufferHolder(
    4616             :             const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
    4617             :         {
    4618           0 :             oSrcBufferHolder = oSrcBufferHolderIn;
    4619           0 :         }
    4620             :     };
    4621             : 
    4622             :     // Thread function to resample. pData must point to an OvrJob; outputs are stored in the job, then completion is signalled. Also called directly when no job queue is used.
    4623         782 :     const auto JobResampleFunc = [](void *pData)
    4624             :     {
    4625         782 :         OvrJob *poJob = static_cast<OvrJob *>(pData);
    4626             :         // Run either the generic resampling callback or GDALResampleChunkC32R()
    4627         782 :         if (poJob->bUseGenericResampleFn)
    4628             :         {
    4629         780 :             poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
    4630             :                                                &(poJob->pDstBuffer),
    4631             :                                                &(poJob->eDstBufferDataType));
    4632             :         }
    4633             :         else
    4634             :         {
    4635           2 :             poJob->eErr = GDALResampleChunkC32R(
    4636             :                 poJob->nSrcWidth, poJob->nSrcHeight,
    4637           2 :                 static_cast<const float *>(poJob->pChunk),  // chunk is passed as a float array on this path
    4638             :                 poJob->args.nChunkYOff, poJob->args.nChunkYSize,
    4639             :                 poJob->args.nDstYOff, poJob->args.nDstYOff2,
    4640             :                 poJob->args.nOvrXSize, poJob->args.nOvrYSize,
    4641             :                 &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
    4642             :                 poJob->args.pszResampling);
    4643             :         }
    4644             :         // Wrap the output buffer in a holder owned by the job (done whatever the value of eErr)
    4645             :         poJob->oDstBufferHolder =
    4646         782 :             std::make_unique<PointerHolder>(poJob->pDstBuffer);
    4647             :         // Flag completion under the job mutex and wake up a thread waiting on poJob->cv
    4648             :         {
    4649        1564 :             std::lock_guard<std::mutex> guard(poJob->mutex);
    4650         782 :             poJob->bFinished = true;
    4651         782 :             poJob->cv.notify_one();
    4652             :         }
    4653         782 :     };
    4654             : 
    4655             :     // Function to write resampled data to target band: writes lines [nDstYOff, nDstYOff2) over nDstWidth columns from pDstBuffer and returns the RasterIO() result
    4656         782 :     const auto WriteJobData = [](const OvrJob *poJob)
    4657             :     {
    4658        1564 :         return poJob->poDstBand->RasterIO(
    4659         782 :             GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
    4660         782 :             poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
    4661         782 :             poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,  // buffer window has the same size as the written window
    4662         782 :             poJob->eDstBufferDataType, 0, 0, nullptr);
    4663             :     };
    4664             : 
    4665             :     // Wait for completion of oldest job, serialize it and remove it from the list. Returns the job error, or the write error if resampling succeeded.
    4666             :     const auto WaitAndFinalizeOldestJob =
    4667           0 :         [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    4668             :     {
    4669           0 :         auto poOldestJob = jobList.front().get();  // jobList must not be empty
    4670             :         {
    4671           0 :             std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
    4672             :             // coverity[missing_lock:FALSE]
    4673           0 :             while (!poOldestJob->bFinished)  // re-check the flag after each wake-up
    4674             :             {
    4675           0 :                 poOldestJob->cv.wait(oGuard);
    4676             :             }
    4677             :         }
    4678           0 :         CPLErr l_eErr = poOldestJob->eErr;
    4679           0 :         if (l_eErr == CE_None)
    4680             :         {
    4681           0 :             l_eErr = WriteJobData(poOldestJob);
    4682             :         }
    4683             :         // The job is removed from the list whether or not it succeeded
    4684           0 :         jobList.pop_front();
    4685           0 :         return l_eErr;
    4686             :     };
    4687             : 
    4688             :     // Queue of jobs
    4689        1400 :     std::list<std::unique_ptr<OvrJob>> jobList;
    4690             : 
    4691         700 :     GByte *pabyChunkNodataMask = nullptr;
    4692         700 :     void *pChunk = nullptr;
    4693             : 
    4694         700 :     const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
    4695        2800 :     const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
    4696         700 :                                                        ? CPLGetNumCPUs()
    4697         700 :                                                        : atoi(pszThreads)));
    4698             :     auto poThreadPool =
    4699         700 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    4700             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    4701        1400 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    4702             : 
    4703             :     /* -------------------------------------------------------------------- */
    4704             :     /*      Loop over image operating on chunks.                            */
    4705             :     /* -------------------------------------------------------------------- */
    4706         700 :     int nChunkYOff = 0;
    4707         700 :     CPLErr eErr = CE_None;
    4708             : 
    4709        1405 :     for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
    4710         705 :          nChunkYOff += nFullResYChunk)
    4711             :     {
    4712         705 :         if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
    4713             :                          pProgressData))
    4714             :         {
    4715           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    4716           0 :             eErr = CE_Failure;
    4717             :         }
    4718             : 
    4719         705 :         if (nFullResYChunk + nChunkYOff > nHeight)
    4720         698 :             nFullResYChunk = nHeight - nChunkYOff;
    4721             : 
    4722         705 :         int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
    4723         705 :         int nChunkYSizeQueried =
    4724         705 :             nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
    4725         705 :         if (nChunkYOffQueried < 0)
    4726             :         {
    4727          62 :             nChunkYSizeQueried += nChunkYOffQueried;
    4728          62 :             nChunkYOffQueried = 0;
    4729             :         }
    4730         705 :         if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
    4731          62 :             nChunkYSizeQueried = nHeight - nChunkYOffQueried;
    4732             : 
    4733             :         // Avoid accumulating too many tasks and exhausting RAM
    4734             :         // Try to complete already finished jobs
    4735         705 :         while (eErr == CE_None && !jobList.empty())
    4736             :         {
    4737           0 :             auto poOldestJob = jobList.front().get();
    4738             :             {
    4739           0 :                 std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
    4740           0 :                 if (!poOldestJob->bFinished)
    4741             :                 {
    4742           0 :                     break;
    4743             :                 }
    4744             :             }
    4745           0 :             eErr = poOldestJob->eErr;
    4746           0 :             if (eErr == CE_None)
    4747             :             {
    4748           0 :                 eErr = WriteJobData(poOldestJob);
    4749             :             }
    4750             : 
    4751           0 :             jobList.pop_front();
    4752             :         }
    4753             : 
    4754             :         // And in case we have saturated the number of threads,
    4755             :         // wait for completion of tasks to go below the threshold.
    4756        1410 :         while (eErr == CE_None &&
    4757         705 :                jobList.size() >= static_cast<size_t>(nThreads))
    4758             :         {
    4759           0 :             eErr = WaitAndFinalizeOldestJob(jobList);
    4760             :         }
    4761             : 
    4762             :         // (Re)allocate buffers if needed
    4763         705 :         if (pChunk == nullptr)
    4764             :         {
    4765         700 :             pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    4766             :                                          nMaxChunkYSizeQueried, nWidth);
    4767             :         }
    4768         705 :         if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
    4769             :         {
    4770             :             pabyChunkNodataMask = static_cast<GByte *>(
    4771         274 :                 VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
    4772             :         }
    4773             : 
    4774         705 :         if (pChunk == nullptr ||
    4775         274 :             (bUseNoDataMask && pabyChunkNodataMask == nullptr))
    4776             :         {
    4777           0 :             CPLFree(pChunk);
    4778           0 :             CPLFree(pabyChunkNodataMask);
    4779           0 :             return CE_Failure;
    4780             :         }
    4781             : 
    4782             :         // Read chunk.
    4783         705 :         if (eErr == CE_None)
    4784         705 :             eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    4785             :                                        nChunkYSizeQueried, pChunk, nWidth,
    4786             :                                        nChunkYSizeQueried, eWrkDataType, 0, 0,
    4787             :                                        nullptr);
    4788         705 :         if (eErr == CE_None && bUseNoDataMask)
    4789         274 :             eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    4790             :                                         nChunkYSizeQueried, pabyChunkNodataMask,
    4791             :                                         nWidth, nChunkYSizeQueried, GDT_Byte, 0,
    4792             :                                         0, nullptr);
    4793             : 
    4794             :         // Special case to promote 1bit data to 8bit 0/255 values.
    4795         705 :         if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
    4796             :         {
    4797           9 :             if (eWrkDataType == GDT_Float32)
    4798             :             {
    4799           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    4800           0 :                 for (GPtrDiff_t i = 0;
    4801           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4802             :                      i++)
    4803             :                 {
    4804           0 :                     if (pafChunk[i] == 1.0)
    4805           0 :                         pafChunk[i] = 255.0;
    4806             :                 }
    4807             :             }
    4808           9 :             else if (eWrkDataType == GDT_Byte)
    4809             :             {
    4810           9 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    4811      168417 :                 for (GPtrDiff_t i = 0;
    4812      168417 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4813             :                      i++)
    4814             :                 {
    4815      168408 :                     if (pabyChunk[i] == 1)
    4816      127437 :                         pabyChunk[i] = 255;
    4817             :                 }
    4818             :             }
    4819           0 :             else if (eWrkDataType == GDT_UInt16)
    4820             :             {
    4821           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    4822           0 :                 for (GPtrDiff_t i = 0;
    4823           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4824             :                      i++)
    4825             :                 {
    4826           0 :                     if (pasChunk[i] == 1)
    4827           0 :                         pasChunk[i] = 255;
    4828             :                 }
    4829             :             }
    4830           0 :             else if (eWrkDataType == GDT_Float64)
    4831             :             {
    4832           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    4833           0 :                 for (GPtrDiff_t i = 0;
    4834           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4835             :                      i++)
    4836             :                 {
    4837           0 :                     if (padfChunk[i] == 1.0)
    4838           0 :                         padfChunk[i] = 255.0;
    4839             :                 }
    4840             :             }
    4841             :             else
    4842             :             {
    4843           0 :                 CPLAssert(false);
    4844             :             }
    4845             :         }
    4846         696 :         else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
    4847             :         {
    4848           0 :             if (eWrkDataType == GDT_Float32)
    4849             :             {
    4850           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    4851           0 :                 for (GPtrDiff_t i = 0;
    4852           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4853             :                      i++)
    4854             :                 {
    4855           0 :                     if (pafChunk[i] == 1.0)
    4856           0 :                         pafChunk[i] = 0.0;
    4857           0 :                     else if (pafChunk[i] == 0.0)
    4858           0 :                         pafChunk[i] = 255.0;
    4859             :                 }
    4860             :             }
    4861           0 :             else if (eWrkDataType == GDT_Byte)
    4862             :             {
    4863           0 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    4864           0 :                 for (GPtrDiff_t i = 0;
    4865           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4866             :                      i++)
    4867             :                 {
    4868           0 :                     if (pabyChunk[i] == 1)
    4869           0 :                         pabyChunk[i] = 0;
    4870           0 :                     else if (pabyChunk[i] == 0)
    4871           0 :                         pabyChunk[i] = 255;
    4872             :                 }
    4873             :             }
    4874           0 :             else if (eWrkDataType == GDT_UInt16)
    4875             :             {
    4876           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    4877           0 :                 for (GPtrDiff_t i = 0;
    4878           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4879             :                      i++)
    4880             :                 {
    4881           0 :                     if (pasChunk[i] == 1)
    4882           0 :                         pasChunk[i] = 0;
    4883           0 :                     else if (pasChunk[i] == 0)
    4884           0 :                         pasChunk[i] = 255;
    4885             :                 }
    4886             :             }
    4887           0 :             else if (eWrkDataType == GDT_Float64)
    4888             :             {
    4889           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    4890           0 :                 for (GPtrDiff_t i = 0;
    4891           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4892             :                      i++)
    4893             :                 {
    4894           0 :                     if (padfChunk[i] == 1.0)
    4895           0 :                         padfChunk[i] = 0.0;
    4896           0 :                     else if (padfChunk[i] == 0.0)
    4897           0 :                         padfChunk[i] = 255.0;
    4898             :                 }
    4899             :             }
    4900             :             else
    4901             :             {
    4902           0 :                 CPLAssert(false);
    4903             :             }
    4904             :         }
    4905             : 
    4906             :         auto oSrcBufferHolder =
    4907        1410 :             std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
    4908             :         auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
    4909        1410 :             poJobQueue ? pabyChunkNodataMask : nullptr);
    4910             : 
    4911        1487 :         for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
    4912             :              ++iOverview)
    4913             :         {
    4914         782 :             GDALRasterBand *poDstBand = papoOvrBands[iOverview];
    4915         782 :             const int nDstWidth = poDstBand->GetXSize();
    4916         782 :             const int nDstHeight = poDstBand->GetYSize();
    4917             : 
    4918         782 :             const double dfXRatioDstToSrc =
    4919         782 :                 static_cast<double>(nWidth) / nDstWidth;
    4920         782 :             const double dfYRatioDstToSrc =
    4921         782 :                 static_cast<double>(nHeight) / nDstHeight;
    4922             : 
    4923             :             /* --------------------------------------------------------------------
    4924             :              */
    4925             :             /*      Figure out the line to start writing to, and the first line
    4926             :              */
    4927             :             /*      to not write to.  In theory this approach should ensure that
    4928             :              */
    4929             :             /*      every output line will be written if all input chunks are */
    4930             :             /*      processed. */
    4931             :             /* --------------------------------------------------------------------
    4932             :              */
    4933         782 :             int nDstYOff =
    4934         782 :                 static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
    4935         782 :             if (nDstYOff == nDstHeight)
    4936           0 :                 continue;
    4937         782 :             int nDstYOff2 = static_cast<int>(
    4938         782 :                 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
    4939             : 
    4940         782 :             if (nChunkYOff + nFullResYChunk == nHeight)
    4941         775 :                 nDstYOff2 = nDstHeight;
    4942             : #if DEBUG_VERBOSE
    4943             :             CPLDebug("GDAL",
    4944             :                      "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
    4945             :                      nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
    4946             :                      nDstWidth, nDstYOff2 - nDstYOff);
    4947             : #endif
    4948             : 
    4949        1564 :             auto poJob = std::make_unique<OvrJob>();
    4950         782 :             poJob->pfnResampleFn = pfnResampleFn;
    4951         782 :             poJob->bUseGenericResampleFn = bUseGenericResampleFn;
    4952         782 :             poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
    4953         782 :             poJob->args.nOvrXSize = poDstBand->GetXSize();
    4954         782 :             poJob->args.nOvrYSize = poDstBand->GetYSize();
    4955             :             const char *pszNBITS =
    4956         782 :                 poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    4957         782 :             poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    4958         782 :             poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    4959         782 :             poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    4960         782 :             poJob->args.eWrkDataType = eWrkDataType;
    4961         782 :             poJob->pChunk = pChunk;
    4962         782 :             poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
    4963         782 :             poJob->nSrcWidth = nWidth;
    4964         782 :             poJob->nSrcHeight = nHeight;
    4965         782 :             poJob->args.nChunkXOff = 0;
    4966         782 :             poJob->args.nChunkXSize = nWidth;
    4967         782 :             poJob->args.nChunkYOff = nChunkYOff;
    4968         782 :             poJob->args.nChunkYSize = nChunkYSizeQueried;
    4969         782 :             poJob->nDstWidth = nDstWidth;
    4970         782 :             poJob->args.nDstXOff = 0;
    4971         782 :             poJob->args.nDstXOff2 = nDstWidth;
    4972         782 :             poJob->args.nDstYOff = nDstYOff;
    4973         782 :             poJob->args.nDstYOff2 = nDstYOff2;
    4974         782 :             poJob->poDstBand = poDstBand;
    4975         782 :             poJob->args.pszResampling = pszResampling;
    4976         782 :             poJob->args.bHasNoData = bHasNoData;
    4977         782 :             poJob->args.dfNoDataValue = dfNoDataValue;
    4978         782 :             poJob->args.poColorTable = poColorTable;
    4979         782 :             poJob->args.eSrcDataType = eSrcDataType;
    4980         782 :             poJob->args.bPropagateNoData = bPropagateNoData;
    4981             : 
    4982         782 :             if (poJobQueue)
    4983             :             {
    4984           0 :                 poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
    4985           0 :                 poJob->SetSrcBufferHolder(oSrcBufferHolder);
    4986           0 :                 poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    4987           0 :                 jobList.emplace_back(std::move(poJob));
    4988             :             }
    4989             :             else
    4990             :             {
    4991         782 :                 JobResampleFunc(poJob.get());
    4992         782 :                 eErr = poJob->eErr;
    4993         782 :                 if (eErr == CE_None)
    4994             :                 {
    4995         782 :                     eErr = WriteJobData(poJob.get());
    4996             :                 }
    4997             :             }
    4998             :         }
    4999             : 
    5000         705 :         if (poJobQueue)
    5001             :         {
    5002           0 :             pChunk = nullptr;
    5003           0 :             pabyChunkNodataMask = nullptr;
    5004             :         }
    5005             :     }
    5006             : 
    5007         700 :     VSIFree(pChunk);
    5008         700 :     VSIFree(pabyChunkNodataMask);
    5009             : 
    5010             :     // Wait for all pending jobs to complete
    5011         700 :     while (!jobList.empty())
    5012             :     {
    5013           0 :         const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    5014           0 :         if (l_eErr != CE_None && eErr == CE_None)
    5015           0 :             eErr = l_eErr;
    5016             :     }
    5017             : 
    5018             :     /* -------------------------------------------------------------------- */
    5019             :     /*      Renormalize overview mean / stddev if needed.                   */
    5020             :     /* -------------------------------------------------------------------- */
    5021         700 :     if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
    5022             :     {
    5023           0 :         GDALOverviewMagnitudeCorrection(
    5024             :             poSrcBand, nOverviewCount,
    5025             :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
    5026             :             GDALDummyProgress, nullptr);
    5027             :     }
    5028             : 
    5029             :     /* -------------------------------------------------------------------- */
    5030             :     /*      It can be important to flush out data to overviews.             */
    5031             :     /* -------------------------------------------------------------------- */
    5032        1475 :     for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
    5033             :          ++iOverview)
    5034             :     {
    5035         775 :         eErr = papoOvrBands[iOverview]->FlushCache(false);
    5036             :     }
    5037             : 
    5038         700 :     if (eErr == CE_None)
    5039         700 :         pfnProgress(1.0, nullptr, pProgressData);
    5040             : 
    5041         700 :     return eErr;
    5042             : }
    5043             : 
    5044             : /************************************************************************/
    5045             : /*            GDALRegenerateOverviewsMultiBand()                        */
    5046             : /************************************************************************/
    5047             : 
    5048             : /**
    5049             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    5050             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
    5051             :  *
    5052             :  * This function will generate one or more overview images from a base
    5053             :  * image using the requested downsampling algorithm.  Its primary use
    5054             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    5055             :  * can also be used to generate downsampled images in one file from another
    5056             :  * outside the overview architecture.
    5057             :  *
    5058             :  * The output bands need to exist in advance and share the same characteristics
    5059             :  * (type, dimensions)
    5060             :  *
    5061             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    5062             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
    5063             :  *
    5064             :  * It does not support color tables or complex data types.
    5065             :  *
    5066             :  * The pseudo-algorithm used by the function is :
    5067             :  *    for each overview
    5068             :  *       iterate on lines of the source by a step of deltay
    5069             :  *           iterate on columns of the source  by a step of deltax
    5070             :  *               read the source data of size deltax * deltay for all the bands
    5071             :  *               generate the corresponding overview block for all the bands
    5072             :  *
    5073             :  * This function will properly honour NODATA_VALUES tuples (special dataset
    5074             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    5075             :  * considered as the nodata value and not each value of the triplet
    5076             :  * independently per band.
    5077             :  *
    5078             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    5079             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    5080             :  * overview computation.
    5081             :  *
    5082             :  * @param nBands the number of bands, size of papoSrcBands and size of
    5083             :  *               first dimension of papapoOverviewBands
    5084             :  * @param papoSrcBands the list of source bands to downsample
    5085             :  * @param nOverviews the number of downsampled overview levels being generated.
    5086             :  * @param papapoOverviewBands two-dimensional array of bands. First dimension
    5087             :  *                            is indexed by nBands. Second dimension is indexed
    5088             :  *                            by nOverviews.
    5089             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    5090             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
    5091             :  * @param pfnProgress progress report function.
    5092             :  * @param pProgressData progress function callback data.
    5093             :  * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
    5094             :  *                     key=value pairs, or NULL
    5095             :  *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
    5096             :  *                     options can be specified to express that overviews should
    5097             :  *                     be regenerated only in the specified subset of the source
    5098             :  *                     dataset.
    5099             :  * @return CE_None on success or CE_Failure on failure.
    5100             :  */
    5101             : 
    5102         354 : CPLErr GDALRegenerateOverviewsMultiBand(
    5103             :     int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
    5104             :     GDALRasterBand *const *const *papapoOverviewBands,
    5105             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    5106             :     void *pProgressData, CSLConstList papszOptions)
    5107             : {
    5108         354 :     CPL_IGNORE_RET_VAL(papszOptions);
    5109             : 
    5110         354 :     if (pfnProgress == nullptr)
    5111           6 :         pfnProgress = GDALDummyProgress;
    5112             : 
    5113         354 :     if (EQUAL(pszResampling, "NONE"))
    5114           2 :         return CE_None;
    5115             : 
    5116             :     // Sanity checks.
    5117         352 :     if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
    5118         169 :         !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
    5119          70 :         !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
    5120          18 :         !EQUAL(pszResampling, "CUBICSPLINE") &&
    5121          17 :         !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
    5122           5 :         !EQUAL(pszResampling, "MODE"))
    5123             :     {
    5124           0 :         CPLError(CE_Failure, CPLE_NotSupported,
    5125             :                  "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
    5126             :                  "not supported",
    5127             :                  pszResampling);
    5128           0 :         return CE_Failure;
    5129             :     }
    5130             : 
    5131         352 :     int nKernelRadius = 0;
    5132             :     GDALResampleFunction pfnResampleFn =
    5133         352 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    5134         352 :     if (pfnResampleFn == nullptr)
    5135           0 :         return CE_Failure;
    5136             : 
    5137         352 :     const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
    5138         352 :     const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
    5139         352 :     if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
    5140           0 :         return CE_None;
    5141         352 :     GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
    5142         647 :     for (int iBand = 1; iBand < nBands; ++iBand)
    5143             :     {
    5144         590 :         if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
    5145         295 :             papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
    5146             :         {
    5147           0 :             CPLError(
    5148             :                 CE_Failure, CPLE_NotSupported,
    5149             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5150             :                 "have the same dimensions");
    5151           0 :             return CE_Failure;
    5152             :         }
    5153         295 :         if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
    5154             :         {
    5155           0 :             CPLError(
    5156             :                 CE_Failure, CPLE_NotSupported,
    5157             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5158             :                 "have the same data type");
    5159           0 :             return CE_Failure;
    5160             :         }
    5161             :     }
    5162             : 
    5163         938 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5164             :     {
    5165         586 :         const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
    5166         586 :         const int nDstWidth = poOvrFirstBand->GetXSize();
    5167         586 :         const int nDstHeight = poOvrFirstBand->GetYSize();
    5168        1151 :         for (int iBand = 1; iBand < nBands; ++iBand)
    5169             :         {
    5170         565 :             const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
    5171        1130 :             if (poOvrBand->GetXSize() != nDstWidth ||
    5172         565 :                 poOvrBand->GetYSize() != nDstHeight)
    5173             :             {
    5174           0 :                 CPLError(
    5175             :                     CE_Failure, CPLE_NotSupported,
    5176             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5177             :                     "of the same level must have the same dimensions");
    5178           0 :                 return CE_Failure;
    5179             :             }
    5180         565 :             if (poOvrBand->GetRasterDataType() != eDataType)
    5181             :             {
    5182           0 :                 CPLError(
    5183             :                     CE_Failure, CPLE_NotSupported,
    5184             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5185             :                     "must have the same data type as the source bands");
    5186           0 :                 return CE_Failure;
    5187             :             }
    5188             :         }
    5189             :     }
    5190             : 
    5191             :     // First pass to compute the total number of pixels to write.
    5192         352 :     double dfTotalPixelCount = 0;
    5193         352 :     const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
    5194         352 :     const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
    5195         352 :     const int nSrcXSize = atoi(CSLFetchNameValueDef(
    5196             :         papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
    5197         352 :     const int nSrcYSize = atoi(CSLFetchNameValueDef(
    5198             :         papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
    5199         938 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5200             :     {
    5201         586 :         dfTotalPixelCount +=
    5202        1172 :             static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
    5203         586 :             papapoOverviewBands[0][iOverview]->GetXSize() *
    5204        1172 :             static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
    5205         586 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5206             :     }
    5207             : 
    5208             :     const GDALDataType eWrkDataType =
    5209         352 :         GDALGetOvrWorkDataType(pszResampling, eDataType);
    5210         352 :     const int nWrkDataTypeSize = GDALGetDataTypeSizeBytes(eWrkDataType);
    5211             : 
    5212         352 :     const bool bIsMask = papoSrcBands[0]->IsMaskBand();
    5213             : 
    5214             :     // If we have a nodata mask and we are doing something more complicated
    5215             :     // than nearest neighbouring, we have to fetch the nodata mask.
    5216             :     const bool bUseNoDataMask =
    5217         515 :         !STARTS_WITH_CI(pszResampling, "NEAR") &&
    5218         163 :         (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
    5219             : 
    5220             :     bool *const pabHasNoData =
    5221         352 :         static_cast<bool *>(VSI_MALLOC_VERBOSE(nBands * sizeof(bool)));
    5222             :     double *const padfNoDataValue =
    5223         352 :         static_cast<double *>(VSI_MALLOC_VERBOSE(nBands * sizeof(double)));
    5224         352 :     if (pabHasNoData == nullptr || padfNoDataValue == nullptr)
    5225             :     {
    5226           0 :         CPLFree(pabHasNoData);
    5227           0 :         CPLFree(padfNoDataValue);
    5228           0 :         return CE_Failure;
    5229             :     }
    5230             : 
    5231         999 :     for (int iBand = 0; iBand < nBands; ++iBand)
    5232             :     {
    5233         647 :         int nHasNoData = 0;
    5234        1294 :         padfNoDataValue[iBand] =
    5235         647 :             papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
    5236         647 :         pabHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
    5237             :     }
    5238             :     const bool bPropagateNoData =
    5239         352 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    5240             : 
    5241         352 :     const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
    5242        1408 :     const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
    5243         352 :                                                        ? CPLGetNumCPUs()
    5244         352 :                                                        : atoi(pszThreads)));
    5245             :     auto poThreadPool =
    5246         352 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    5247             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    5248         352 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    5249             : 
    5250             :     // Only configurable for debug / testing
    5251             :     const int nChunkMaxSize = std::max(
    5252         352 :         100, atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760")));
    5253             : 
    5254             :     // Second pass to do the real job.
    5255         352 :     double dfCurPixelCount = 0;
    5256         352 :     CPLErr eErr = CE_None;
    5257         937 :     for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
    5258             :          ++iOverview)
    5259             :     {
    5260         585 :         int iSrcOverview = -1;  // -1 means the source bands.
    5261             : 
    5262             :         const int nDstTotalWidth =
    5263         585 :             papapoOverviewBands[0][iOverview]->GetXSize();
    5264             :         const int nDstTotalHeight =
    5265         585 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5266             : 
    5267             :         // Compute the coordinates of the target region to refresh
    5268         585 :         constexpr double EPS = 1e-8;
    5269         585 :         const int nDstXOffStart = static_cast<int>(
    5270         585 :             static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
    5271             :             EPS);
    5272             :         const int nDstXOffEnd =
    5273        1170 :             std::min(static_cast<int>(
    5274         585 :                          std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
    5275         585 :                                        nToplevelSrcWidth * nDstTotalWidth -
    5276             :                                    EPS)),
    5277         585 :                      nDstTotalWidth);
    5278         585 :         const int nDstWidth = nDstXOffEnd - nDstXOffStart;
    5279         585 :         const int nDstYOffStart =
    5280         585 :             static_cast<int>(static_cast<double>(nSrcYOff) /
    5281         585 :                                  nToplevelSrcHeight * nDstTotalHeight +
    5282             :                              EPS);
    5283             :         const int nDstYOffEnd =
    5284        1170 :             std::min(static_cast<int>(
    5285         585 :                          std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
    5286         585 :                                        nToplevelSrcHeight * nDstTotalHeight -
    5287             :                                    EPS)),
    5288         585 :                      nDstTotalHeight);
    5289             : 
    5290             :         // Try to use previous level of overview as the source to compute
    5291             :         // the next level.
    5292         585 :         int nSrcWidth = nToplevelSrcWidth;
    5293         585 :         int nSrcHeight = nToplevelSrcHeight;
    5294         818 :         if (iOverview > 0 &&
    5295         233 :             papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
    5296             :         {
    5297         225 :             nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
    5298         225 :             nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
    5299         225 :             iSrcOverview = iOverview - 1;
    5300             :         }
    5301             : 
    5302         585 :         const double dfXRatioDstToSrc =
    5303         585 :             static_cast<double>(nSrcWidth) / nDstTotalWidth;
    5304         585 :         const double dfYRatioDstToSrc =
    5305         585 :             static_cast<double>(nSrcHeight) / nDstTotalHeight;
    5306             : 
    5307        1170 :         int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    5308         585 :                                   static_cast<int>(0.5 + dfYRatioDstToSrc));
    5309         585 :         if (nOvrFactor == 0)
    5310           0 :             nOvrFactor = 1;
    5311             : 
    5312         585 :         int nDstChunkXSize = 0;
    5313         585 :         int nDstChunkYSize = 0;
    5314         585 :         papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
    5315             :                                                         &nDstChunkYSize);
    5316             : 
    5317             :         const char *pszDST_CHUNK_X_SIZE =
    5318         585 :             CSLFetchNameValue(papszOptions, "DST_CHUNK_X_SIZE");
    5319             :         const char *pszDST_CHUNK_Y_SIZE =
    5320         585 :             CSLFetchNameValue(papszOptions, "DST_CHUNK_Y_SIZE");
    5321         585 :         if (pszDST_CHUNK_X_SIZE && pszDST_CHUNK_Y_SIZE)
    5322             :         {
    5323          12 :             nDstChunkXSize = std::max(1, atoi(pszDST_CHUNK_X_SIZE));
    5324          12 :             nDstChunkYSize = std::max(1, atoi(pszDST_CHUNK_Y_SIZE));
    5325          12 :             CPLDebug("GDAL", "Using dst chunk size %d x %d", nDstChunkXSize,
    5326             :                      nDstChunkYSize);
    5327             :         }
    5328             : 
    5329             :         // Try to extend the chunk size so that the memory needed to acquire
    5330             :         // source pixels goes up to 10 MB.
    5331             :         // This can help for drivers that support multi-threaded reading
    5332         585 :         const int nFullResYChunk =
    5333         585 :             2 + static_cast<int>(nDstChunkYSize * dfYRatioDstToSrc);
    5334         585 :         const int nFullResYChunkQueried =
    5335         585 :             nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
    5336         821 :         while (nDstChunkXSize < nDstWidth)
    5337             :         {
    5338         253 :             const int nFullResXChunk =
    5339         253 :                 2 + static_cast<int>(2 * nDstChunkXSize * dfXRatioDstToSrc);
    5340             : 
    5341         253 :             const int nFullResXChunkQueried =
    5342         253 :                 nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    5343             : 
    5344         253 :             if (static_cast<GIntBig>(nFullResXChunkQueried) *
    5345         253 :                     nFullResYChunkQueried * nBands * nWrkDataTypeSize >
    5346         253 :                 nChunkMaxSize)
    5347             :             {
    5348          17 :                 break;
    5349             :             }
    5350             : 
    5351         236 :             nDstChunkXSize *= 2;
    5352             :         }
    5353         585 :         nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
    5354             : 
    5355         585 :         const int nFullResXChunk =
    5356         585 :             2 + static_cast<int>(nDstChunkXSize * dfXRatioDstToSrc);
    5357         585 :         const int nFullResXChunkQueried =
    5358         585 :             nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    5359             : 
    5360             :         // Make sure that the RAM requirements to acquire the source data does
    5361             :         // not exceed nChunkMaxSize
    5362             :         // If so, reduce the destination chunk size, generate overviews in a
    5363             :         // temporary dataset, and copy that temporary dataset over the target
    5364             :         // overview bands (to avoid issues with lossy compression)
    5365         585 :         const auto nMemRequirement =
    5366         585 :             static_cast<GIntBig>(nFullResXChunkQueried) *
    5367         585 :             nFullResYChunkQueried * nBands * nWrkDataTypeSize;
    5368         585 :         if (nMemRequirement > nChunkMaxSize &&
    5369          10 :             !(pszDST_CHUNK_X_SIZE && pszDST_CHUNK_Y_SIZE))
    5370             :         {
    5371             :             // Compute a smaller destination chunk size
    5372          12 :             const auto nOverShootFactor = nMemRequirement / nChunkMaxSize;
    5373             :             const auto nSqrtOverShootFactor = std::max<GIntBig>(
    5374          24 :                 4, static_cast<GIntBig>(std::ceil(
    5375          12 :                        std::sqrt(static_cast<double>(nOverShootFactor)))));
    5376             :             const int nReducedDstChunkXSize = std::max(
    5377          12 :                 1, static_cast<int>(nDstChunkXSize / nSqrtOverShootFactor));
    5378             :             const int nReducedDstChunkYSize = std::max(
    5379          12 :                 1, static_cast<int>(nDstChunkYSize / nSqrtOverShootFactor));
    5380          12 :             if (nReducedDstChunkXSize < nDstChunkXSize ||
    5381           0 :                 nReducedDstChunkYSize < nDstChunkYSize)
    5382             :             {
    5383          12 :                 CPLStringList aosOptions(papszOptions);
    5384             :                 aosOptions.SetNameValue(
    5385             :                     "DST_CHUNK_X_SIZE",
    5386          12 :                     CPLSPrintf("%d", nReducedDstChunkXSize));
    5387             :                 aosOptions.SetNameValue(
    5388             :                     "DST_CHUNK_Y_SIZE",
    5389          12 :                     CPLSPrintf("%d", nReducedDstChunkYSize));
    5390             : 
    5391             :                 const auto nTmpDSMemRequirement =
    5392          12 :                     static_cast<GIntBig>(nDstTotalWidth) * nDstTotalHeight *
    5393          12 :                     nBands * GDALGetDataTypeSizeBytes(eDataType);
    5394           0 :                 std::unique_ptr<GDALDataset> poTmpDS;
    5395             :                 // Config option mostly/only for autotest purposes
    5396             :                 const char *pszGDAL_OVR_TEMP_DRIVER =
    5397          12 :                     CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
    5398          12 :                 if ((nTmpDSMemRequirement <= nChunkMaxSize &&
    5399           2 :                      !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
    5400          10 :                     EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
    5401             :                 {
    5402             :                     auto poTmpDrv =
    5403          11 :                         GetGDALDriverManager()->GetDriverByName("MEM");
    5404          11 :                     if (!poTmpDrv)
    5405             :                     {
    5406           0 :                         eErr = CE_Failure;
    5407           0 :                         break;
    5408             :                     }
    5409          11 :                     poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
    5410             :                                                    nDstTotalHeight, nBands,
    5411          11 :                                                    eDataType, nullptr));
    5412             :                 }
    5413             :                 else
    5414             :                 {
    5415             :                     auto poTmpDrv =
    5416           1 :                         GetGDALDriverManager()->GetDriverByName("GTiff");
    5417           1 :                     if (!poTmpDrv)
    5418             :                     {
    5419           0 :                         eErr = CE_Failure;
    5420           0 :                         break;
    5421             :                     }
    5422           2 :                     std::string osTmpFilename;
    5423           1 :                     auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
    5424           1 :                     if (poDstDS)
    5425             :                     {
    5426           1 :                         osTmpFilename = poDstDS->GetDescription();
    5427             :                         VSIStatBufL sStatBuf;
    5428           1 :                         if (!osTmpFilename.empty() &&
    5429           0 :                             VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
    5430           0 :                             osTmpFilename += "_tmp_ovr.tif";
    5431             :                     }
    5432           1 :                     if (osTmpFilename.empty())
    5433             :                     {
    5434           1 :                         osTmpFilename = CPLGenerateTempFilename(nullptr);
    5435           1 :                         osTmpFilename += ".tif";
    5436             :                     }
    5437           1 :                     CPLDebug("GDAL",
    5438             :                              "Creating temporary file %s of %d x %d x %d",
    5439             :                              osTmpFilename.c_str(), nDstTotalWidth,
    5440             :                              nDstTotalHeight, nBands);
    5441           2 :                     CPLStringList aosCO;
    5442           1 :                     poTmpDS.reset(poTmpDrv->Create(
    5443             :                         osTmpFilename.c_str(), nDstTotalWidth, nDstTotalHeight,
    5444           1 :                         nBands, eDataType, aosCO.List()));
    5445           1 :                     if (poTmpDS)
    5446             :                     {
    5447           1 :                         poTmpDS->MarkSuppressOnClose();
    5448           1 :                         VSIUnlink(osTmpFilename.c_str());
    5449             :                     }
    5450             :                 }
    5451          12 :                 if (!poTmpDS)
    5452             :                 {
    5453           0 :                     eErr = CE_Failure;
    5454           0 :                     break;
    5455             :                 }
    5456             : 
    5457          12 :                 std::vector<GDALRasterBand **> apapoOverviewBands(nBands);
    5458          27 :                 for (int i = 0; i < nBands; ++i)
    5459             :                 {
    5460          30 :                     apapoOverviewBands[i] = static_cast<GDALRasterBand **>(
    5461          15 :                         CPLMalloc(sizeof(GDALRasterBand *)));
    5462          15 :                     apapoOverviewBands[i][0] = poTmpDS->GetRasterBand(i + 1);
    5463             :                 }
    5464             : 
    5465             :                 const double dfExtraPixels =
    5466          24 :                     static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
    5467          12 :                     papapoOverviewBands[0][iOverview]->GetXSize() *
    5468          24 :                     static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
    5469          12 :                     papapoOverviewBands[0][iOverview]->GetYSize();
    5470             : 
    5471          24 :                 void *pScaledProgressData = GDALCreateScaledProgress(
    5472             :                     dfCurPixelCount / dfTotalPixelCount,
    5473          12 :                     (dfCurPixelCount + dfExtraPixels) / dfTotalPixelCount,
    5474             :                     pfnProgress, pProgressData);
    5475             : 
    5476             :                 // Generate overviews in temporary dataset
    5477          12 :                 eErr = GDALRegenerateOverviewsMultiBand(
    5478          12 :                     nBands, papoSrcBands, 1, apapoOverviewBands.data(),
    5479             :                     pszResampling, GDALScaledProgress, pScaledProgressData,
    5480          12 :                     aosOptions.List());
    5481             : 
    5482          12 :                 GDALDestroyScaledProgress(pScaledProgressData);
    5483             : 
    5484          12 :                 dfCurPixelCount += dfExtraPixels;
    5485             : 
    5486          27 :                 for (int i = 0; i < nBands; ++i)
    5487             :                 {
    5488          15 :                     CPLFree(apapoOverviewBands[i]);
    5489             :                 }
    5490             : 
    5491             :                 // Copy temporary dataset to destination overview bands
    5492             : 
    5493          12 :                 if (eErr == CE_None)
    5494             :                 {
    5495             :                     // Check if all papapoOverviewBands[][iOverview] bands point
    5496             :                     // to the same dataset. If so, we can use
    5497             :                     // GDALDatasetCopyWholeRaster()
    5498             :                     GDALDataset *poDstOvrBandDS =
    5499          12 :                         papapoOverviewBands[0][iOverview]->GetDataset();
    5500          12 :                     if (poDstOvrBandDS)
    5501             :                     {
    5502          15 :                         if (poDstOvrBandDS->GetRasterCount() != nBands ||
    5503           3 :                             poDstOvrBandDS->GetRasterBand(1) !=
    5504           3 :                                 papapoOverviewBands[0][iOverview])
    5505             :                         {
    5506           9 :                             poDstOvrBandDS = nullptr;
    5507             :                         }
    5508             :                         else
    5509             :                         {
    5510           6 :                             for (int i = 1; poDstOvrBandDS && i < nBands; ++i)
    5511             :                             {
    5512             :                                 GDALDataset *poThisDstOvrBandDS =
    5513           3 :                                     papapoOverviewBands[i][iOverview]
    5514           3 :                                         ->GetDataset();
    5515           3 :                                 if (poThisDstOvrBandDS == nullptr ||
    5516           6 :                                     poThisDstOvrBandDS != poDstOvrBandDS ||
    5517           3 :                                     poThisDstOvrBandDS->GetRasterBand(i + 1) !=
    5518           3 :                                         papapoOverviewBands[i][iOverview])
    5519             :                                 {
    5520           0 :                                     poDstOvrBandDS = nullptr;
    5521             :                                 }
    5522             :                             }
    5523             :                         }
    5524             :                     }
    5525          12 :                     if (poDstOvrBandDS)
    5526             :                     {
    5527           3 :                         eErr = GDALDatasetCopyWholeRaster(
    5528             :                             GDALDataset::ToHandle(poTmpDS.get()),
    5529             :                             GDALDataset::ToHandle(poDstOvrBandDS), nullptr,
    5530             :                             nullptr, nullptr);
    5531             :                     }
    5532             :                     else
    5533             :                     {
    5534          18 :                         for (int i = 0; eErr == CE_None && i < nBands; ++i)
    5535             :                         {
    5536           9 :                             eErr = GDALRasterBandCopyWholeRaster(
    5537             :                                 GDALRasterBand::ToHandle(
    5538             :                                     poTmpDS->GetRasterBand(i + 1)),
    5539             :                                 GDALRasterBand::ToHandle(
    5540           9 :                                     papapoOverviewBands[i][iOverview]),
    5541             :                                 nullptr, nullptr, nullptr);
    5542             :                         }
    5543             :                     }
    5544             :                 }
    5545             : 
    5546          12 :                 if (eErr != CE_None)
    5547           0 :                     break;
    5548             : 
    5549          12 :                 continue;
    5550             :             }
    5551             :         }
    5552             : 
    5553             :         // Structure describing one resampling job: the inputs handed to the resampling function, the outputs it produces, and the state used to signal completion.
    5554             :         struct OvrJob
    5555             :         {
    5556             :             // Holders of the buffers to free when the job is finished (presumably source mask, source chunk and resampled output, going by the member names)
    5557             :             std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
    5558             :             std::unique_ptr<PointerHolder> oSrcBufferHolder{};
    5559             :             std::unique_ptr<PointerHolder> oDstBufferHolder{};  // set by JobResampleFunc once pDstBuffer is known
    5560             :             // Band that WriteJobData() writes the resampled buffer to
    5561             :             GDALRasterBand *poDstBand = nullptr;
    5562             : 
    5563             :             // Input parameters of pfnResampleFn
    5564             :             GDALResampleFunction pfnResampleFn = nullptr;  // resampling function invoked by JobResampleFunc
    5565             :             GDALOverviewResampleArgs args{};  // also provides the destination window (nDstXOff/nDstYOff/nDstXOff2/nDstYOff2) used when writing
    5566             :             const void *pChunk = nullptr;  // passed as-is to pfnResampleFn
    5567             : 
    5568             :             // Output values of resampling function
    5569             :             CPLErr eErr = CE_Failure;  // stays CE_Failure until pfnResampleFn has returned
    5570             :             void *pDstBuffer = nullptr;  // filled in by pfnResampleFn
    5571             :             GDALDataType eDstBufferDataType = GDT_Unknown;  // data type of pDstBuffer, as reported by pfnResampleFn
    5572             : 
    5573             :             // Synchronization: bFinished is set to true under mutex, and cv notified, at the end of JobResampleFunc
    5574             :             bool bFinished = false;
    5575             :             std::mutex mutex{};
    5576             :             std::condition_variable cv{};
    5577             :         };
    5578             : 
    5579             :         // Thread function to resample
    5580       16228 :         const auto JobResampleFunc = [](void *pData)
    5581             :         {
    5582       16228 :             OvrJob *poJob = static_cast<OvrJob *>(pData);
    5583             : 
    5584       16228 :             poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
    5585             :                                                &(poJob->pDstBuffer),
    5586             :                                                &(poJob->eDstBufferDataType));
    5587             : 
    5588       16224 :             poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
    5589             : 
    5590             :             {
    5591       32460 :                 std::lock_guard<std::mutex> guard(poJob->mutex);
    5592       16230 :                 poJob->bFinished = true;
    5593       16230 :                 poJob->cv.notify_one();
    5594             :             }
    5595       16230 :         };
    5596             : 
    5597             :         // Function to write resample data to target band
    5598       16230 :         const auto WriteJobData = [](const OvrJob *poJob)
    5599             :         {
    5600       32460 :             return poJob->poDstBand->RasterIO(
    5601       16230 :                 GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
    5602       16230 :                 poJob->args.nDstXOff2 - poJob->args.nDstXOff,
    5603       16230 :                 poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
    5604       16230 :                 poJob->args.nDstXOff2 - poJob->args.nDstXOff,
    5605       16230 :                 poJob->args.nDstYOff2 - poJob->args.nDstYOff,
    5606       16230 :                 poJob->eDstBufferDataType, 0, 0, nullptr);
    5607             :         };
    5608             : 
    5609             :         // Wait for completion of oldest job and serialize it
    5610             :         const auto WaitAndFinalizeOldestJob =
    5611          15 :             [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    5612             :         {
    5613          15 :             auto poOldestJob = jobList.front().get();
    5614             :             {
    5615          30 :                 std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
    5616             :                 // coverity[missing_lock:FALSE]
    5617          18 :                 while (!poOldestJob->bFinished)
    5618             :                 {
    5619           3 :                     poOldestJob->cv.wait(oGuard);
    5620             :                 }
    5621             :             }
    5622          15 :             CPLErr l_eErr = poOldestJob->eErr;
    5623          15 :             if (l_eErr == CE_None)
    5624             :             {
    5625          15 :                 l_eErr = WriteJobData(poOldestJob);
    5626             :             }
    5627             : 
    5628          15 :             jobList.pop_front();
    5629          15 :             return l_eErr;
    5630             :         };
    5631             : 
    5632             :         // Queue of jobs
    5633        1146 :         std::list<std::unique_ptr<OvrJob>> jobList;
    5634             : 
    5635        1146 :         std::vector<void *> apaChunk(nBands);
    5636        1146 :         std::vector<GByte *> apabyChunkNoDataMask(nBands);
    5637             : 
    5638             :         // Iterate on destination overview, block by block.
    5639         573 :         for (int nDstYOff = nDstYOffStart;
    5640        2209 :              nDstYOff < nDstYOffEnd && eErr == CE_None;
    5641        1636 :              nDstYOff += nDstChunkYSize)
    5642             :         {
    5643             :             int nDstYCount;
    5644        1636 :             if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
    5645        1248 :                 nDstYCount = nDstChunkYSize;
    5646             :             else
    5647         388 :                 nDstYCount = nDstYOffEnd - nDstYOff;
    5648             : 
    5649        1636 :             int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    5650        1636 :             int nChunkYOff2 = static_cast<int>(
    5651        1636 :                 ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
    5652        1636 :             if (nChunkYOff2 > nSrcHeight ||
    5653        1636 :                 nDstYOff + nDstYCount == nDstTotalHeight)
    5654         570 :                 nChunkYOff2 = nSrcHeight;
    5655        1636 :             int nYCount = nChunkYOff2 - nChunkYOff;
    5656        1636 :             CPLAssert(nYCount <= nFullResYChunk);
    5657             : 
    5658        1636 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    5659        1636 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
    5660        1636 :             if (nChunkYOffQueried < 0)
    5661             :             {
    5662         126 :                 nChunkYSizeQueried += nChunkYOffQueried;
    5663         126 :                 nChunkYOffQueried = 0;
    5664             :             }
    5665        1636 :             if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
    5666         125 :                 nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
    5667        1636 :             CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
    5668             : 
    5669        1636 :             if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount, nullptr,
    5670             :                              pProgressData))
    5671             :             {
    5672           1 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    5673           1 :                 eErr = CE_Failure;
    5674             :             }
    5675             : 
    5676             :             // Iterate on destination overview, block by block.
    5677        1636 :             for (int nDstXOff = nDstXOffStart;
    5678       10051 :                  nDstXOff < nDstXOffEnd && eErr == CE_None;
    5679        8415 :                  nDstXOff += nDstChunkXSize)
    5680             :             {
    5681        8415 :                 int nDstXCount = 0;
    5682        8415 :                 if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
    5683        8218 :                     nDstXCount = nDstChunkXSize;
    5684             :                 else
    5685         197 :                     nDstXCount = nDstXOffEnd - nDstXOff;
    5686             : 
    5687        8415 :                 dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
    5688             : 
    5689        8415 :                 int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    5690        8415 :                 int nChunkXOff2 = static_cast<int>(
    5691        8415 :                     ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    5692        8415 :                 if (nChunkXOff2 > nSrcWidth ||
    5693        8415 :                     nDstXOff + nDstXCount == nDstTotalWidth)
    5694        1634 :                     nChunkXOff2 = nSrcWidth;
    5695        8415 :                 const int nXCount = nChunkXOff2 - nChunkXOff;
    5696        8415 :                 CPLAssert(nXCount <= nFullResXChunk);
    5697             : 
    5698        8415 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    5699        8415 :                 int nChunkXSizeQueried =
    5700        8415 :                     nXCount + 2 * nKernelRadius * nOvrFactor;
    5701        8415 :                 if (nChunkXOffQueried < 0)
    5702             :                 {
    5703         186 :                     nChunkXSizeQueried += nChunkXOffQueried;
    5704         186 :                     nChunkXOffQueried = 0;
    5705             :                 }
    5706        8415 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
    5707         189 :                     nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
    5708        8415 :                 CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
    5709             : #if DEBUG_VERBOSE
    5710             :                 CPLDebug("GDAL",
    5711             :                          "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
    5712             :                          nChunkXOffQueried, nChunkYOffQueried,
    5713             :                          nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
    5714             :                          nDstYOff, nDstXCount, nDstYCount);
    5715             : #endif
    5716             : 
    5717             :                 // Avoid accumulating too many tasks and exhaust RAM
    5718             : 
    5719             :                 // Try to complete already finished jobs
    5720       16512 :                 while (eErr == CE_None && !jobList.empty())
    5721             :                 {
    5722        8192 :                     auto poOldestJob = jobList.front().get();
    5723             :                     {
    5724        8192 :                         std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
    5725        8192 :                         if (!poOldestJob->bFinished)
    5726             :                         {
    5727          95 :                             break;
    5728             :                         }
    5729             :                     }
    5730        8097 :                     eErr = poOldestJob->eErr;
    5731        8097 :                     if (eErr == CE_None)
    5732             :                     {
    5733        8097 :                         eErr = WriteJobData(poOldestJob);
    5734             :                     }
    5735             : 
    5736        8097 :                     jobList.pop_front();
    5737             :                 }
    5738             : 
    5739             :                 // And in case we have saturated the number of threads,
    5740             :                 // wait for completion of tasks to go below the threshold.
    5741       16830 :                 while (eErr == CE_None &&
    5742        8415 :                        jobList.size() >= static_cast<size_t>(nThreads))
    5743             :                 {
    5744           0 :                     eErr = WaitAndFinalizeOldestJob(jobList);
    5745             :                 }
    5746             : 
    5747             :                 // (Re)allocate buffers if needed
    5748       24646 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    5749             :                 {
    5750       16231 :                     if (apaChunk[iBand] == nullptr)
    5751             :                     {
    5752        9233 :                         apaChunk[iBand] = VSI_MALLOC3_VERBOSE(
    5753             :                             nFullResXChunkQueried, nFullResYChunkQueried,
    5754             :                             nWrkDataTypeSize);
    5755        9233 :                         if (apaChunk[iBand] == nullptr)
    5756             :                         {
    5757           0 :                             eErr = CE_Failure;
    5758             :                         }
    5759             :                     }
    5760       24644 :                     if (bUseNoDataMask &&
    5761        8413 :                         apabyChunkNoDataMask[iBand] == nullptr)
    5762             :                     {
    5763       16708 :                         apabyChunkNoDataMask[iBand] =
    5764        8354 :                             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    5765             :                                 nFullResXChunkQueried, nFullResYChunkQueried));
    5766        8354 :                         if (apabyChunkNoDataMask[iBand] == nullptr)
    5767             :                         {
    5768           0 :                             eErr = CE_Failure;
    5769             :                         }
    5770             :                     }
    5771             :                 }
    5772             : 
    5773             :                 // Read the source buffers for all the bands.
    5774       24646 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    5775             :                 {
    5776       16231 :                     GDALRasterBand *poSrcBand = nullptr;
    5777       16231 :                     if (iSrcOverview == -1)
    5778       15337 :                         poSrcBand = papoSrcBands[iBand];
    5779             :                     else
    5780         894 :                         poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
    5781       16231 :                     eErr = poSrcBand->RasterIO(
    5782             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    5783       16231 :                         nChunkXSizeQueried, nChunkYSizeQueried, apaChunk[iBand],
    5784             :                         nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType, 0,
    5785             :                         0, nullptr);
    5786             : 
    5787       16231 :                     if (bUseNoDataMask && eErr == CE_None)
    5788             :                     {
    5789        8413 :                         auto poMaskBand = poSrcBand->IsMaskBand()
    5790        8413 :                                               ? poSrcBand
    5791        6312 :                                               : poSrcBand->GetMaskBand();
    5792        8413 :                         eErr = poMaskBand->RasterIO(
    5793             :                             GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    5794             :                             nChunkXSizeQueried, nChunkYSizeQueried,
    5795        8413 :                             apabyChunkNoDataMask[iBand], nChunkXSizeQueried,
    5796             :                             nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
    5797             :                     }
    5798             :                 }
    5799             : 
    5800             :                 // Compute the resulting overview block.
    5801       24645 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    5802             :                 {
    5803       32460 :                     auto poJob = std::make_unique<OvrJob>();
    5804       16230 :                     poJob->pfnResampleFn = pfnResampleFn;
    5805       16230 :                     poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
    5806       32460 :                     poJob->args.eOvrDataType =
    5807       16230 :                         poJob->poDstBand->GetRasterDataType();
    5808       16230 :                     poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
    5809       16230 :                     poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
    5810       16230 :                     const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
    5811       16230 :                         "NBITS", "IMAGE_STRUCTURE");
    5812       16230 :                     poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    5813       16230 :                     poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    5814       16230 :                     poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    5815       16230 :                     poJob->args.eWrkDataType = eWrkDataType;
    5816       16230 :                     poJob->pChunk = apaChunk[iBand];
    5817       16230 :                     poJob->args.pabyChunkNodataMask =
    5818       16230 :                         apabyChunkNoDataMask[iBand];
    5819       16230 :                     poJob->args.nChunkXOff = nChunkXOffQueried;
    5820       16230 :                     poJob->args.nChunkXSize = nChunkXSizeQueried;
    5821       16230 :                     poJob->args.nChunkYOff = nChunkYOffQueried;
    5822       16230 :                     poJob->args.nChunkYSize = nChunkYSizeQueried;
    5823       16230 :                     poJob->args.nDstXOff = nDstXOff;
    5824       16230 :                     poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
    5825       16230 :                     poJob->args.nDstYOff = nDstYOff;
    5826       16230 :                     poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
    5827       16230 :                     poJob->args.pszResampling = pszResampling;
    5828       16230 :                     poJob->args.bHasNoData = pabHasNoData[iBand];
    5829       16230 :                     poJob->args.dfNoDataValue = padfNoDataValue[iBand];
    5830       16230 :                     poJob->args.eSrcDataType = eDataType;
    5831       16230 :                     poJob->args.bPropagateNoData = bPropagateNoData;
    5832             : 
    5833       16230 :                     if (poJobQueue)
    5834             :                     {
    5835       16224 :                         poJob->oSrcMaskBufferHolder.reset(
    5836        8112 :                             new PointerHolder(apabyChunkNoDataMask[iBand]));
    5837        8112 :                         apabyChunkNoDataMask[iBand] = nullptr;
    5838             : 
    5839       16224 :                         poJob->oSrcBufferHolder.reset(
    5840        8112 :                             new PointerHolder(apaChunk[iBand]));
    5841        8112 :                         apaChunk[iBand] = nullptr;
    5842             : 
    5843        8112 :                         poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    5844        8112 :                         jobList.emplace_back(std::move(poJob));
    5845             :                     }
    5846             :                     else
    5847             :                     {
    5848        8118 :                         JobResampleFunc(poJob.get());
    5849        8118 :                         eErr = poJob->eErr;
    5850        8118 :                         if (eErr == CE_None)
    5851             :                         {
    5852        8118 :                             eErr = WriteJobData(poJob.get());
    5853             :                         }
    5854             :                     }
    5855             :                 }
    5856             :             }
    5857             :         }
    5858             : 
    5859             :         // Wait for all pending jobs to complete
    5860         588 :         while (!jobList.empty())
    5861             :         {
    5862          15 :             const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    5863          15 :             if (l_eErr != CE_None && eErr == CE_None)
    5864           0 :                 eErr = l_eErr;
    5865             :         }
    5866             : 
    5867             :         // Flush the data to overviews.
    5868        1708 :         for (int iBand = 0; iBand < nBands; ++iBand)
    5869             :         {
    5870        1135 :             CPLFree(apaChunk[iBand]);
    5871        1135 :             papapoOverviewBands[iBand][iOverview]->FlushCache(false);
    5872             : 
    5873        1135 :             CPLFree(apabyChunkNoDataMask[iBand]);
    5874             :         }
    5875             :     }
    5876             : 
    5877         352 :     CPLFree(pabHasNoData);
    5878         352 :     CPLFree(padfNoDataValue);
    5879             : 
    5880         352 :     if (eErr == CE_None)
    5881         350 :         pfnProgress(1.0, nullptr, pProgressData);
    5882             : 
    5883         352 :     return eErr;
    5884             : }
    5885             : 
    5886             : /************************************************************************/
    5887             : /*            GDALRegenerateOverviewsMultiBand()                        */
    5888             : /************************************************************************/
    5889             : 
    5890             : /**
    5891             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    5892             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
    5893             :  *
    5894             :  * This function will generate one or more overview images from a base
    5895             :  * image using the requested downsampling algorithm.  Its primary use
    5896             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    5897             :  * can also be used to generate downsampled images in one file from another
    5898             :  * outside the overview architecture.
    5899             :  *
    5900             :  * The output bands need to exist in advance and share the same characteristics
    5901             :  * (type, dimensions)
    5902             :  *
    5903             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    5904             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
    5905             :  *
    5906             :  * It does not support color tables or complex data types.
    5907             :  *
    5908             :  * The pseudo-algorithm used by the function is :
    5909             :  *    for each overview
    5910             :  *       iterate on lines of the source by a step of deltay
    5911             :  *           iterate on columns of the source  by a step of deltax
    5912             :  *               read the source data of size deltax * deltay for all the bands
    5913             :  *               generate the corresponding overview block for all the bands
    5914             :  *
    5915             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    5916             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    5917             :  * considered as the nodata value and not each value of the triplet
    5918             :  * independently per band.
    5919             :  *
    5920             :  * The GDAL_NUM_THREADS configuration option can be set
    5921             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    5922             :  * overview computation.
    5923             :  *
    5924             :  * @param apoSrcBands the list of source bands to downsample
    5925             :  * @param aapoOverviewBands two-dimensional array of bands. First dimension is
    5926             :  *                          indexed by bands. Second dimension is indexed by
    5927             :  *                          overview levels. All aapoOverviewBands[i] arrays
    5928             :  *                          must have the same size (i.e. same number of
    5929             :  *                          overviews)
    5930             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    5931             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
    5932             :  * @param pfnProgress progress report function.
    5933             :  * @param pProgressData progress function callback data.
    5934             :  * @param papszOptions NULL terminated list of options as
    5935             :  *                     key=value pairs, or NULL
    5936             :  *                     The XOFF, YOFF, XSIZE and YSIZE
    5937             :  *                     options can be specified to express that overviews should
    5938             :  *                     be regenerated only in the specified subset of the source
    5939             :  *                     dataset.
    5940             :  * @return CE_None on success or CE_Failure on failure.
    5941             :  * @since 3.10
    5942             :  */
    5943             : 
    5944           5 : CPLErr GDALRegenerateOverviewsMultiBand(
    5945             :     const std::vector<GDALRasterBand *> &apoSrcBands,
    5946             :     const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
    5947             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    5948             :     void *pProgressData, CSLConstList papszOptions)
    5949             : {
    5950           5 :     CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());
    5951          15 :     for (size_t i = 1; i < aapoOverviewBands.size(); ++i)
    5952             :     {
    5953          10 :         CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
    5954             :     }
    5955             : 
    5956           5 :     if (aapoOverviewBands.empty())
    5957           0 :         return CE_None;
    5958             : 
    5959           5 :     std::vector<GDALRasterBand **> apapoOverviewBands;
    5960          20 :     for (auto &apoOverviewBands : aapoOverviewBands)
    5961             :     {
    5962             :         auto papoOverviewBands = static_cast<GDALRasterBand **>(
    5963          15 :             CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));
    5964          30 :         for (size_t i = 0; i < apoOverviewBands.size(); ++i)
    5965             :         {
    5966          15 :             papoOverviewBands[i] = apoOverviewBands[i];
    5967             :         }
    5968          15 :         apapoOverviewBands.push_back(papoOverviewBands);
    5969             :     }
    5970          10 :     const CPLErr eErr = GDALRegenerateOverviewsMultiBand(
    5971           5 :         static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
    5972           5 :         static_cast<int>(aapoOverviewBands[0].size()),
    5973           5 :         apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
    5974             :         papszOptions);
    5975          20 :     for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)
    5976          15 :         CPLFree(papoOverviewBands);
    5977           5 :     return eErr;
    5978             : }
    5979             : 
    5980             : /************************************************************************/
    5981             : /*                        GDALComputeBandStats()                        */
    5982             : /************************************************************************/
    5983             : 
    5984             : /** Undocumented
    5985             :  * @param hSrcBand undocumented.
    5986             :  * @param nSampleStep Step between scanlines used to compute statistics.
    5987             :  *                    When nSampleStep is equal to 1, all scanlines will
    5988             :  *                    be processed.
    5989             :  * @param pdfMean undocumented.
    5990             :  * @param pdfStdDev undocumented.
    5991             :  * @param pfnProgress undocumented.
    5992             :  * @param pProgressData undocumented.
    5993             :  * @return undocumented
    5994             :  */
    5995          16 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
    5996             :                                         int nSampleStep, double *pdfMean,
    5997             :                                         double *pdfStdDev,
    5998             :                                         GDALProgressFunc pfnProgress,
    5999             :                                         void *pProgressData)
    6000             : 
    6001             : {
    6002          16 :     VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
    6003             : 
    6004          16 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    6005             : 
    6006          16 :     if (pfnProgress == nullptr)
    6007          16 :         pfnProgress = GDALDummyProgress;
    6008             : 
    6009          16 :     const int nWidth = poSrcBand->GetXSize();
    6010          16 :     const int nHeight = poSrcBand->GetYSize();
    6011             : 
    6012          16 :     if (nSampleStep >= nHeight || nSampleStep < 1)
    6013           3 :         nSampleStep = 1;
    6014             : 
    6015          16 :     GDALDataType eWrkType = GDT_Unknown;
    6016          16 :     float *pafData = nullptr;
    6017          16 :     GDALDataType eType = poSrcBand->GetRasterDataType();
    6018          16 :     const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    6019          16 :     if (bComplex)
    6020             :     {
    6021             :         pafData = static_cast<float *>(
    6022           0 :             VSI_MALLOC_VERBOSE(nWidth * 2 * sizeof(float)));
    6023           0 :         eWrkType = GDT_CFloat32;
    6024             :     }
    6025             :     else
    6026             :     {
    6027             :         pafData =
    6028          16 :             static_cast<float *>(VSI_MALLOC_VERBOSE(nWidth * sizeof(float)));
    6029          16 :         eWrkType = GDT_Float32;
    6030             :     }
    6031             : 
    6032          16 :     if (nWidth == 0 || pafData == nullptr)
    6033             :     {
    6034           0 :         VSIFree(pafData);
    6035           0 :         return CE_Failure;
    6036             :     }
    6037             : 
    6038             :     /* -------------------------------------------------------------------- */
    6039             :     /*      Loop over all sample lines.                                     */
    6040             :     /* -------------------------------------------------------------------- */
    6041          16 :     double dfSum = 0.0;
    6042          16 :     double dfSum2 = 0.0;
    6043          16 :     int iLine = 0;
    6044          16 :     GIntBig nSamples = 0;
    6045             : 
    6046        2143 :     do
    6047             :     {
    6048        2159 :         if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    6049             :                          pProgressData))
    6050             :         {
    6051           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6052           0 :             CPLFree(pafData);
    6053           0 :             return CE_Failure;
    6054             :         }
    6055             : 
    6056             :         const CPLErr eErr =
    6057        2159 :             poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
    6058             :                                 1, eWrkType, 0, 0, nullptr);
    6059        2159 :         if (eErr != CE_None)
    6060             :         {
    6061           1 :             CPLFree(pafData);
    6062           1 :             return eErr;
    6063             :         }
    6064             : 
    6065      725204 :         for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    6066             :         {
    6067      723046 :             float fValue = 0.0f;
    6068             : 
    6069      723046 :             if (bComplex)
    6070             :             {
    6071             :                 // Compute the magnitude of the complex value.
    6072             :                 fValue =
    6073           0 :                     std::hypot(pafData[iPixel * 2], pafData[iPixel * 2 + 1]);
    6074             :             }
    6075             :             else
    6076             :             {
    6077      723046 :                 fValue = pafData[iPixel];
    6078             :             }
    6079             : 
    6080      723046 :             dfSum += fValue;
    6081      723046 :             dfSum2 += static_cast<double>(fValue) * fValue;
    6082             :         }
    6083             : 
    6084        2158 :         nSamples += nWidth;
    6085        2158 :         iLine += nSampleStep;
    6086        2158 :     } while (iLine < nHeight);
    6087             : 
    6088          15 :     if (!pfnProgress(1.0, nullptr, pProgressData))
    6089             :     {
    6090           0 :         CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6091           0 :         CPLFree(pafData);
    6092           0 :         return CE_Failure;
    6093             :     }
    6094             : 
    6095             :     /* -------------------------------------------------------------------- */
    6096             :     /*      Produce the result values.                                      */
    6097             :     /* -------------------------------------------------------------------- */
    6098          15 :     if (pdfMean != nullptr)
    6099          15 :         *pdfMean = dfSum / nSamples;
    6100             : 
    6101          15 :     if (pdfStdDev != nullptr)
    6102             :     {
    6103          15 :         const double dfMean = dfSum / nSamples;
    6104             : 
    6105          15 :         *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
    6106             :     }
    6107             : 
    6108          15 :     CPLFree(pafData);
    6109             : 
    6110          15 :     return CE_None;
    6111             : }
    6112             : 
    6113             : /************************************************************************/
    6114             : /*                  GDALOverviewMagnitudeCorrection()                   */
    6115             : /*                                                                      */
    6116             : /*      Correct the mean and standard deviation of the overviews of     */
    6117             : /*      the given band to match the base layer approximately.           */
    6118             : /************************************************************************/
    6119             : 
    6120             : /** Undocumented
    6121             :  * @param hBaseBand undocumented.
    6122             :  * @param nOverviewCount undocumented.
    6123             :  * @param pahOverviews undocumented.
    6124             :  * @param pfnProgress undocumented.
    6125             :  * @param pProgressData undocumented.
    6126             :  * @return undocumented
    6127             :  */
    6128           0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
    6129             :                                        int nOverviewCount,
    6130             :                                        GDALRasterBandH *pahOverviews,
    6131             :                                        GDALProgressFunc pfnProgress,
    6132             :                                        void *pProgressData)
    6133             : 
    6134             : {
    6135           0 :     VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
    6136             : 
    6137             :     /* -------------------------------------------------------------------- */
    6138             :     /*      Compute mean/stddev for source raster.                          */
    6139             :     /* -------------------------------------------------------------------- */
    6140           0 :     double dfOrigMean = 0.0;
    6141           0 :     double dfOrigStdDev = 0.0;
    6142             :     {
    6143             :         const CPLErr eErr =
    6144           0 :             GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
    6145             :                                  pfnProgress, pProgressData);
    6146             : 
    6147           0 :         if (eErr != CE_None)
    6148           0 :             return eErr;
    6149             :     }
    6150             : 
    6151             :     /* -------------------------------------------------------------------- */
    6152             :     /*      Loop on overview bands.                                         */
    6153             :     /* -------------------------------------------------------------------- */
    6154           0 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    6155             :     {
    6156             :         GDALRasterBand *poOverview =
    6157           0 :             GDALRasterBand::FromHandle(pahOverviews[iOverview]);
    6158             :         double dfOverviewMean, dfOverviewStdDev;
    6159             : 
    6160             :         const CPLErr eErr =
    6161           0 :             GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
    6162             :                                  &dfOverviewStdDev, pfnProgress, pProgressData);
    6163             : 
    6164           0 :         if (eErr != CE_None)
    6165           0 :             return eErr;
    6166             : 
    6167           0 :         double dfGain = 1.0;
    6168           0 :         if (dfOrigStdDev >= 0.0001)
    6169           0 :             dfGain = dfOrigStdDev / dfOverviewStdDev;
    6170             : 
    6171             :         /* --------------------------------------------------------------------
    6172             :          */
    6173             :         /*      Apply gain and offset. */
    6174             :         /* --------------------------------------------------------------------
    6175             :          */
    6176           0 :         const int nWidth = poOverview->GetXSize();
    6177           0 :         const int nHeight = poOverview->GetYSize();
    6178             : 
    6179           0 :         GDALDataType eWrkType = GDT_Unknown;
    6180           0 :         float *pafData = nullptr;
    6181           0 :         const GDALDataType eType = poOverview->GetRasterDataType();
    6182           0 :         const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    6183           0 :         if (bComplex)
    6184             :         {
    6185             :             pafData = static_cast<float *>(
    6186           0 :                 VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
    6187           0 :             eWrkType = GDT_CFloat32;
    6188             :         }
    6189             :         else
    6190             :         {
    6191             :             pafData = static_cast<float *>(
    6192           0 :                 VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
    6193           0 :             eWrkType = GDT_Float32;
    6194             :         }
    6195             : 
    6196           0 :         if (pafData == nullptr)
    6197             :         {
    6198           0 :             return CE_Failure;
    6199             :         }
    6200             : 
    6201           0 :         for (int iLine = 0; iLine < nHeight; ++iLine)
    6202             :         {
    6203           0 :             if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    6204             :                              pProgressData))
    6205             :             {
    6206           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6207           0 :                 CPLFree(pafData);
    6208           0 :                 return CE_Failure;
    6209             :             }
    6210             : 
    6211           0 :             if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
    6212             :                                      nWidth, 1, eWrkType, 0, 0,
    6213           0 :                                      nullptr) != CE_None)
    6214             :             {
    6215           0 :                 CPLFree(pafData);
    6216           0 :                 return CE_Failure;
    6217             :             }
    6218             : 
    6219           0 :             for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    6220             :             {
    6221           0 :                 if (bComplex)
    6222             :                 {
    6223           0 :                     pafData[iPixel * 2] *= static_cast<float>(dfGain);
    6224           0 :                     pafData[iPixel * 2 + 1] *= static_cast<float>(dfGain);
    6225             :                 }
    6226             :                 else
    6227             :                 {
    6228           0 :                     pafData[iPixel] = static_cast<float>(
    6229           0 :                         (pafData[iPixel] - dfOverviewMean) * dfGain +
    6230             :                         dfOrigMean);
    6231             :                 }
    6232             :             }
    6233             : 
    6234           0 :             if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
    6235             :                                      nWidth, 1, eWrkType, 0, 0,
    6236           0 :                                      nullptr) != CE_None)
    6237             :             {
    6238           0 :                 CPLFree(pafData);
    6239           0 :                 return CE_Failure;
    6240             :             }
    6241             :         }
    6242             : 
    6243           0 :         if (!pfnProgress(1.0, nullptr, pProgressData))
    6244             :         {
    6245           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6246           0 :             CPLFree(pafData);
    6247           0 :             return CE_Failure;
    6248             :         }
    6249             : 
    6250           0 :         CPLFree(pafData);
    6251             :     }
    6252             : 
    6253           0 :     return CE_None;
    6254             : }

Generated by: LCOV version 1.14