LCOV - code coverage report
Current view: top level - gcore - overview.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2425 2792 86.9 %
Date: 2025-02-20 10:14:44 Functions: 110 135 81.5 %

          Line data    Source code
       1             : 
       2             : /******************************************************************************
       3             :  *
       4             :  * Project:  GDAL Core
       5             :  * Purpose:  Helper code to implement overview support in different drivers.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 2000, Frank Warmerdam
      10             :  * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * SPDX-License-Identifier: MIT
      13             :  ****************************************************************************/
      14             : 
      15             : #include "cpl_port.h"
      16             : #include "gdal_priv.h"
      17             : 
      18             : #include <cmath>
      19             : #include <cstddef>
      20             : #include <cstdlib>
      21             : 
      22             : #include <algorithm>
      23             : #include <complex>
      24             : #include <condition_variable>
      25             : #include <limits>
      26             : #include <list>
      27             : #include <memory>
      28             : #include <mutex>
      29             : #include <vector>
      30             : 
      31             : #include "cpl_conv.h"
      32             : #include "cpl_error.h"
      33             : #include "cpl_float.h"
      34             : #include "cpl_progress.h"
      35             : #include "cpl_vsi.h"
      36             : #include "gdal.h"
      37             : #include "gdal_thread_pool.h"
      38             : #include "gdalwarper.h"
      39             : 
      40             : #ifdef USE_NEON_OPTIMIZATIONS
      41             : #include "include_sse2neon.h"
      42             : #define USE_SSE2
      43             : 
      44             : #include "gdalsse_priv.h"
      45             : 
      46             : // Restrict to 64bit processors because they are guaranteed to have SSE2,
      47             : // or if __AVX2__ is defined.
      48             : #elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
      49             : #define USE_SSE2
      50             : 
      51             : #include "gdalsse_priv.h"
      52             : 
      53             : #ifdef __SSE3__
      54             : #include <pmmintrin.h>
      55             : #endif
      56             : #ifdef __SSSE3__
      57             : #include <tmmintrin.h>
      58             : #endif
      59             : #ifdef __SSE4_1__
      60             : #include <smmintrin.h>
      61             : #endif
      62             : #ifdef __AVX2__
      63             : #include <immintrin.h>
      64             : #endif
      65             : 
      66             : #endif
      67             : 
      68             : // To be included after above USE_SSE2 and include gdalsse_priv.h
      69             : // to avoid build issue on Windows x86
      70             : #include "gdal_priv_templates.hpp"
      71             : 
      72             : /************************************************************************/
      73             : /*                      GDALResampleChunk_Near()                        */
      74             : /************************************************************************/
      75             : 
      76             : template <class T>
      77        6095 : static CPLErr GDALResampleChunk_NearT(const GDALOverviewResampleArgs &args,
      78             :                                       const T *pChunk, T **ppDstBuffer)
      79             : 
      80             : {
      81        6095 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
      82        6095 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
      83        6095 :     const GDALDataType eWrkDataType = args.eWrkDataType;
      84        6095 :     const int nChunkXOff = args.nChunkXOff;
      85        6095 :     const int nChunkXSize = args.nChunkXSize;
      86        6095 :     const int nChunkYOff = args.nChunkYOff;
      87        6095 :     const int nDstXOff = args.nDstXOff;
      88        6095 :     const int nDstXOff2 = args.nDstXOff2;
      89        6095 :     const int nDstYOff = args.nDstYOff;
      90        6095 :     const int nDstYOff2 = args.nDstYOff2;
      91        6095 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
      92             : 
      93             :     /* -------------------------------------------------------------------- */
      94             :     /*      Allocate buffers.                                               */
      95             :     /* -------------------------------------------------------------------- */
      96        6095 :     *ppDstBuffer = static_cast<T *>(
      97        6095 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
      98             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
      99        6095 :     if (*ppDstBuffer == nullptr)
     100             :     {
     101           0 :         return CE_Failure;
     102             :     }
     103        6095 :     T *const pDstBuffer = *ppDstBuffer;
     104             : 
     105             :     int *panSrcXOff =
     106        6095 :         static_cast<int *>(VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(int)));
     107             : 
     108        6095 :     if (panSrcXOff == nullptr)
     109             :     {
     110           0 :         VSIFree(panSrcXOff);
     111           0 :         return CE_Failure;
     112             :     }
     113             : 
     114             :     /* ==================================================================== */
     115             :     /*      Precompute inner loop constants.                                */
     116             :     /* ==================================================================== */
     117      592860 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
     118             :     {
     119      586765 :         int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
     120      586765 :         if (nSrcXOff < nChunkXOff)
     121           0 :             nSrcXOff = nChunkXOff;
     122             : 
     123      586765 :         panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
     124             :     }
     125             : 
     126             :     /* ==================================================================== */
     127             :     /*      Loop over destination scanlines.                                */
     128             :     /* ==================================================================== */
     129      216591 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
     130             :     {
     131      210496 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
     132      210496 :         if (nSrcYOff < nChunkYOff)
     133           0 :             nSrcYOff = nChunkYOff;
     134             : 
     135      210496 :         const T *const pSrcScanline =
     136             :             pChunk +
     137      210496 :             (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
     138      208026 :             nChunkXOff;
     139             : 
     140             :         /* --------------------------------------------------------------------
     141             :          */
     142             :         /*      Loop over destination pixels */
     143             :         /* --------------------------------------------------------------------
     144             :          */
     145      210496 :         T *pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
     146   119221034 :         for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
     147             :         {
     148   119010564 :             pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
     149             :         }
     150             :     }
     151             : 
     152        6095 :     CPLFree(panSrcXOff);
     153             : 
     154        6095 :     return CE_None;
     155             : }
     156             : 
     157        6095 : static CPLErr GDALResampleChunk_Near(const GDALOverviewResampleArgs &args,
     158             :                                      const void *pChunk, void **ppDstBuffer,
     159             :                                      GDALDataType *peDstBufferDataType)
     160             : {
     161        6095 :     *peDstBufferDataType = args.eWrkDataType;
     162        6095 :     switch (args.eWrkDataType)
     163             :     {
     164             :         // For nearest resampling, as no computation is done, only the
     165             :         // size of the data type matters.
     166        5967 :         case GDT_Byte:
     167             :         case GDT_Int8:
     168             :         {
     169        5967 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 1);
     170        5967 :             return GDALResampleChunk_NearT(
     171             :                 args, static_cast<const uint8_t *>(pChunk),
     172        5967 :                 reinterpret_cast<uint8_t **>(ppDstBuffer));
     173             :         }
     174             : 
     175          26 :         case GDT_Int16:
     176             :         case GDT_UInt16:
     177             :         case GDT_Float16:
     178             :         {
     179          26 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
     180          26 :             return GDALResampleChunk_NearT(
     181             :                 args, static_cast<const uint16_t *>(pChunk),
     182          26 :                 reinterpret_cast<uint16_t **>(ppDstBuffer));
     183             :         }
     184             : 
     185          55 :         case GDT_CInt16:
     186             :         case GDT_CFloat16:
     187             :         case GDT_Int32:
     188             :         case GDT_UInt32:
     189             :         case GDT_Float32:
     190             :         {
     191          55 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
     192          55 :             return GDALResampleChunk_NearT(
     193             :                 args, static_cast<const uint32_t *>(pChunk),
     194          55 :                 reinterpret_cast<uint32_t **>(ppDstBuffer));
     195             :         }
     196             : 
     197          43 :         case GDT_CInt32:
     198             :         case GDT_CFloat32:
     199             :         case GDT_Int64:
     200             :         case GDT_UInt64:
     201             :         case GDT_Float64:
     202             :         {
     203          43 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
     204          43 :             return GDALResampleChunk_NearT(
     205             :                 args, static_cast<const uint64_t *>(pChunk),
     206          43 :                 reinterpret_cast<uint64_t **>(ppDstBuffer));
     207             :         }
     208             : 
     209           4 :         case GDT_CFloat64:
     210             :         {
     211           4 :             return GDALResampleChunk_NearT(
     212             :                 args, static_cast<const std::complex<double> *>(pChunk),
     213           4 :                 reinterpret_cast<std::complex<double> **>(ppDstBuffer));
     214             :         }
     215             : 
     216           0 :         case GDT_Unknown:
     217             :         case GDT_TypeCount:
     218           0 :             break;
     219             :     }
     220           0 :     CPLAssert(false);
     221             :     return CE_Failure;
     222             : }
     223             : 
     224             : namespace
     225             : {
     226             : 
     227             : // Find in the color table the entry whose RGB value is the closest
     228             : // (using quadratic distance) to the test color, ignoring transparent entries.
     229        3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
     230             :                    const GDALColorEntry &test)
     231             : {
     232        3837 :     int nMinDist = std::numeric_limits<int>::max();
     233        3837 :     size_t bestEntry = 0;
     234      986109 :     for (size_t i = 0; i < entries.size(); ++i)
     235             :     {
     236      982272 :         const GDALColorEntry &entry = entries[i];
     237             :         // Ignore transparent entries
     238      982272 :         if (entry.c4 == 0)
     239        3237 :             continue;
     240             : 
     241      979035 :         int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
     242      979035 :                     ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
     243      979035 :                     ((test.c3 - entry.c3) * (test.c3 - entry.c3));
     244      979035 :         if (nDist < nMinDist)
     245             :         {
     246       15847 :             nMinDist = nDist;
     247       15847 :             bestEntry = i;
     248             :         }
     249             :     }
     250        3837 :     return static_cast<int>(bestEntry);
     251             : }
     252             : 
     253           7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
     254             :                                            int &transparentIdx)
     255             : {
     256           7 :     std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
     257             : 
     258           7 :     transparentIdx = -1;
     259           7 :     int i = 0;
     260        1799 :     for (auto &entry : entries)
     261             :     {
     262        1792 :         table.GetColorEntryAsRGB(i, &entry);
     263        1792 :         if (transparentIdx < 0 && entry.c4 == 0)
     264           1 :             transparentIdx = i;
     265        1792 :         ++i;
     266             :     }
     267           7 :     return entries;
     268             : }
     269             : 
     270             : }  // unnamed  namespace
     271             : 
     272             : /************************************************************************/
     273             : /*                             SQUARE()                                 */
     274             : /************************************************************************/
     275             : 
     276        3721 : template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
     277             : {
     278        3721 :     return static_cast<Tsquare>(val) * val;
     279             : }
     280             : 
     281             : /************************************************************************/
     282             : /*                          ComputeIntegerRMS()                         */
     283             : /************************************************************************/
     284             : // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
     285             : // integer that minimizes abs(rms**2 - sumSquares / weight)
     286             : template <class T, class Twork>
     287          42 : inline T ComputeIntegerRMS(double sumSquares, double weight)
     288             : {
     289          42 :     const double sumDivWeight = sumSquares / weight;
     290          42 :     T rms = static_cast<T>(sqrt(sumDivWeight));
     291             : 
     292             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     293             :     // Naive version:
     294             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     295          42 :     if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
     296          42 :         2 * sumDivWeight)
     297           6 :         rms += 1;
     298          42 :     return rms;
     299             : }
     300             : 
     301           0 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
     302             : {
     303           0 :     CPLAssert(false);
     304             :     return 0;
     305             : }
     306             : 
     307          24 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
     308             : {
     309             :     // It has been verified that given the correction on rms below, using
     310             :     // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
     311             :     // is equivalent, so use the former as it is used twice.
     312          24 :     const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
     313          24 :     const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
     314          24 :     GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
     315             : 
     316             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     317             :     // Naive version:
     318             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     319             :     // Optimized version for integer case and weight == 4
     320          24 :     if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
     321           5 :         rms += 1;
     322          24 :     return rms;
     323             : }
     324             : 
     325             : template <>
     326          20 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
     327             : {
     328          20 :     const double sumDivWeight = sumSquares * 0.25;
     329          20 :     GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
     330             : 
     331             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     332             :     // Naive version:
     333             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     334             :     // Optimized version for integer case and weight == 4
     335          20 :     if (static_cast<GUInt32>(rms) * (rms + 1) <
     336          20 :         static_cast<GUInt32>(sumDivWeight + 0.25))
     337           4 :         rms += 1;
     338          20 :     return rms;
     339             : }
     340             : 
     341             : #ifdef USE_SSE2
     342             : 
     343             : /************************************************************************/
     344             : /*                   QuadraticMeanByteSSE2OrAVX2()                      */
     345             : /************************************************************************/
     346             : 
     347             : #if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
     348             : #define sse2_packus_epi32 _mm_packus_epi32
     349             : #else
     350      516119 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
     351             : {
     352      516119 :     const auto minus32768_32 = _mm_set1_epi32(-32768);
     353      516119 :     const auto minus32768_16 = _mm_set1_epi16(-32768);
     354      516119 :     a = _mm_add_epi32(a, minus32768_32);
     355      516119 :     b = _mm_add_epi32(b, minus32768_32);
     356      516119 :     a = _mm_packs_epi32(a, b);
     357      516119 :     a = _mm_sub_epi16(a, minus32768_16);
     358      516119 :     return a;
     359             : }
     360             : #endif
     361             : 
     362             : #if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS)
     363             : #define sse2_hadd_epi16 _mm_hadd_epi16
     364             : #else
     365     4660840 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
     366             : {
     367             :     // Horizontal addition of adjacent pairs
     368     4660840 :     const auto mask = _mm_set1_epi32(0xFFFF);
     369             :     const auto horizLo =
     370    13982500 :         _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
     371             :     const auto horizHi =
     372    13982500 :         _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
     373             : 
     374             :     // Recombine low and high parts
     375     4660840 :     return _mm_packs_epi32(horizLo, horizHi);
     376             : }
     377             : #endif
     378             : 
     379             : #ifdef __AVX2__
     380             : 
     381             : #define DEST_ELTS 16
     382             : #define set1_epi16 _mm256_set1_epi16
     383             : #define set1_epi32 _mm256_set1_epi32
     384             : #define setzero _mm256_setzero_si256
     385             : #define set1_ps _mm256_set1_ps
     386             : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
     387             : #define unpacklo_epi8 _mm256_unpacklo_epi8
     388             : #define unpackhi_epi8 _mm256_unpackhi_epi8
     389             : #define madd_epi16 _mm256_madd_epi16
     390             : #define add_epi32 _mm256_add_epi32
     391             : #define mul_ps _mm256_mul_ps
     392             : #define cvtepi32_ps _mm256_cvtepi32_ps
     393             : #define sqrt_ps _mm256_sqrt_ps
     394             : #define cvttps_epi32 _mm256_cvttps_epi32
     395             : #define packs_epi32 _mm256_packs_epi32
     396             : #define packus_epi32 _mm256_packus_epi32
     397             : #define srli_epi32 _mm256_srli_epi32
     398             : #define mullo_epi16 _mm256_mullo_epi16
     399             : #define srli_epi16 _mm256_srli_epi16
     400             : #define cmpgt_epi16 _mm256_cmpgt_epi16
     401             : #define add_epi16 _mm256_add_epi16
     402             : #define sub_epi16 _mm256_sub_epi16
     403             : #define packus_epi16 _mm256_packus_epi16
     404             : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
     405             : /* to get the lower 128-bit bits of what would be a true 256-bit vector register
     406             :  */
     407             : #define store_lo(x, y)                                                         \
     408             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
     409             :                      _mm256_extracti128_si256(                                 \
     410             :                          _mm256_permute4x64_epi64((y), 0 | (2 << 2)), 0))
     411             : #define hadd_epi16 _mm256_hadd_epi16
     412             : #define zeroupper() _mm256_zeroupper()
     413             : #else
     414             : #define DEST_ELTS 8
     415             : #define set1_epi16 _mm_set1_epi16
     416             : #define set1_epi32 _mm_set1_epi32
     417             : #define setzero _mm_setzero_si128
     418             : #define set1_ps _mm_set1_ps
     419             : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
     420             : #define unpacklo_epi8 _mm_unpacklo_epi8
     421             : #define unpackhi_epi8 _mm_unpackhi_epi8
     422             : #define madd_epi16 _mm_madd_epi16
     423             : #define add_epi32 _mm_add_epi32
     424             : #define mul_ps _mm_mul_ps
     425             : #define cvtepi32_ps _mm_cvtepi32_ps
     426             : #define sqrt_ps _mm_sqrt_ps
     427             : #define cvttps_epi32 _mm_cvttps_epi32
     428             : #define packs_epi32 _mm_packs_epi32
     429             : #define packus_epi32 sse2_packus_epi32
     430             : #define srli_epi32 _mm_srli_epi32
     431             : #define mullo_epi16 _mm_mullo_epi16
     432             : #define srli_epi16 _mm_srli_epi16
     433             : #define cmpgt_epi16 _mm_cmpgt_epi16
     434             : #define add_epi16 _mm_add_epi16
     435             : #define sub_epi16 _mm_sub_epi16
     436             : #define packus_epi16 _mm_packus_epi16
     437             : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
     438             : #define hadd_epi16 sse2_hadd_epi16
     439             : #define zeroupper() (void)0
     440             : #endif
     441             : 
     442             : #if defined(__GNUC__) && defined(__AVX2__)
     443             : // Disabling inlining works around a bug with gcc 9.3 (Ubuntu 20.04) in
     444             : // -O2 -mavx2 mode in QuadraticMeanFloatSSE2(),
     445             : // where the register that contains minus_zero is correctly
     446             : // loaded the first time the function is called (looking at the disassembly,
     447             : // one sees it is loaded much earlier than the function), but gets corrupted
     448             : // (zeroed) in following iterations.
     449             : // It appears the bug is due to the explicit zeroupper() call at the end of
     450             : // the function.
     451             : // The bug is at least solved in gcc 10.2.
     452             : // Inlining doesn't bring much here to performance.
     453             : // This is also needed with gcc 9.3 on QuadraticMeanByteSSE2OrAVX2() in
     454             : // -O3 -mavx2 mode
     455             : #define NOINLINE __attribute__((noinline))
     456             : #else
     457             : #define NOINLINE
     458             : #endif
     459             : 
     460             : template <class T>
     461             : static int NOINLINE
     462        5385 : QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
     463             :                             const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     464             :                             T *CPL_RESTRICT pDstScanline)
     465             : {
     466             :     // Optimized implementation for RMS on Byte by
     467             :     // processing by group of 8 output pixels, so as to use
     468             :     // a single _mm_sqrt_ps() call for 4 output pixels
     469        5385 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     470             : 
     471        5385 :     int iDstPixel = 0;
     472        5385 :     const auto one16 = set1_epi16(1);
     473        5385 :     const auto one32 = set1_epi32(1);
     474        5385 :     const auto zero = setzero();
     475        5385 :     const auto minus32768 = set1_epi16(-32768);
     476             : 
     477      521496 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     478             :     {
     479             :         // Load 2 * DEST_ELTS bytes from each line
     480      516111 :         auto firstLine = loadu_int(pSrcScanlineShifted);
     481     1032220 :         auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
     482             :         // Extend those Bytes as UInt16s
     483      516111 :         auto firstLineLo = unpacklo_epi8(firstLine, zero);
     484      516111 :         auto firstLineHi = unpackhi_epi8(firstLine, zero);
     485      516111 :         auto secondLineLo = unpacklo_epi8(secondLine, zero);
     486      516111 :         auto secondLineHi = unpackhi_epi8(secondLine, zero);
     487             : 
     488             :         // Multiplication of 16 bit values and horizontal
     489             :         // addition of 32 bit results
     490             :         // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
     491      516111 :         firstLineLo = madd_epi16(firstLineLo, firstLineLo);
     492      516111 :         firstLineHi = madd_epi16(firstLineHi, firstLineHi);
     493      516111 :         secondLineLo = madd_epi16(secondLineLo, secondLineLo);
     494      516111 :         secondLineHi = madd_epi16(secondLineHi, secondLineHi);
     495             : 
     496             :         // Vertical addition
     497      516111 :         const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
     498      516111 :         const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
     499             : 
     500             :         const auto sumSquaresPlusOneDiv4Lo =
     501     1032220 :             srli_epi32(add_epi32(sumSquaresLo, one32), 2);
     502             :         const auto sumSquaresPlusOneDiv4Hi =
     503     1032220 :             srli_epi32(add_epi32(sumSquaresHi, one32), 2);
     504             : 
     505             :         // Take square root and truncate/floor to int32
     506             :         const auto rmsLo =
     507     1548330 :             cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
     508             :         const auto rmsHi =
     509     1548330 :             cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
     510             : 
     511             :         // Merge back low and high registers with each RMS value
     512             :         // as a 16 bit value.
     513      516111 :         auto rms = packs_epi32(rmsLo, rmsHi);
     514             : 
     515             :         // Round to upper value if it minimizes the
     516             :         // error |rms^2 - sumSquares/4|
     517             :         // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     518             :         //    rms += 1;
     519             :         // which is equivalent to:
     520             :         // if( rms * (rms + 1) < (sumSquares+1) / 4 )
     521             :         //    rms += 1;
     522             :         // And both left and right parts fit on 16 (unsigned) bits
     523             :         const auto sumSquaresPlusOneDiv4 =
     524      516111 :             packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
     525             :         // cmpgt_epi16 operates on signed int16, but here
     526             :         // we have unsigned values, so shift them by -32768 before
     527     2580560 :         auto mask = cmpgt_epi16(
     528             :             add_epi16(sumSquaresPlusOneDiv4, minus32768),
     529             :             add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
     530             :         // The value of the mask will be -1 when the correction needs to be
     531             :         // applied
     532      516111 :         rms = sub_epi16(rms, mask);
     533             : 
     534             :         // Pack each 16 bit RMS value to 8 bits
     535      516111 :         rms = packus_epi16(rms, rms /* could be anything */);
     536      516111 :         store_lo(&pDstScanline[iDstPixel], rms);
     537      516111 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     538             :     }
     539             :     zeroupper();
     540             : 
     541        5385 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     542        5385 :     return iDstPixel;
     543             : }
     544             : 
     545             : /************************************************************************/
     546             : /*                      AverageByteSSE2OrAVX2()                         */
     547             : /************************************************************************/
     548             : 
     549             : template <class T>  // NOTE(review): T is presumably a 1-byte pixel type - confirm against callers
     550             : static int
     551      111036 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,  // nDstXWidth: output pixels wanted; nChunkXSize: element offset from the first to the second source line
     552             :                       const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,  // in: start of the first source line; out: advanced past the consumed source pixels
     553             :                       T *CPL_RESTRICT pDstScanline)  // receives the averaged pixels, starting at index 0
     554             : {
     555             :     // Optimized 2x2 average on Byte: every output pixel is
     556             :     // (sum of a 2x2 source block + 2) / 4. Each iteration produces DEST_ELTS output pixels.
     557             : 
     558      111036 :     const auto zero = setzero();  // interleaved with the bytes to widen them to 16 bits
     559      111036 :     const auto two16 = set1_epi16(2);  // rounding term added before the shift by 2
     560      111036 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;  // local cursor, written back on exit
     561             : 
     562      111036 :     int iDstPixel = 0;
     563     4771880 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)  // stops once fewer than DEST_ELTS outputs remain
     564             :     {
     565             :         // Load 2 * DEST_ELTS bytes from each line
     566     4660840 :         const auto firstLine = loadu_int(pSrcScanlineShifted);
     567     9321690 :         const auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);  // second line starts nChunkXSize elements further
     568             :         // Extend those Bytes as UInt16s
     569     4660840 :         const auto firstLineLo = unpacklo_epi8(firstLine, zero);
     570     4660840 :         const auto firstLineHi = unpackhi_epi8(firstLine, zero);
     571     4660840 :         const auto secondLineLo = unpacklo_epi8(secondLine, zero);
     572     4660840 :         const auto secondLineHi = unpackhi_epi8(secondLine, zero);
     573             : 
     574             :         // Vertical addition
     575     4660840 :         const auto sumLo = add_epi16(firstLineLo, secondLineLo);
     576     4660840 :         const auto sumHi = add_epi16(firstLineHi, secondLineHi);
     577             : 
     578             :         // Horizontal addition of adjacent pairs, and recombine low and high
     579             :         // parts
     580     4660840 :         const auto sum = hadd_epi16(sumLo, sumHi);
     581             : 
     582             :         // average = (sum + 2) / 4
     583     9321690 :         auto average = srli_epi16(add_epi16(sum, two16), 2);
     584             : 
     585             :         // Pack each 16 bit average value to 8 bits (averages are <= 255, so the saturation never alters them)
     586     4660840 :         average = packus_epi16(average, average /* could be anything */);
     587     4660840 :         store_lo(&pDstScanline[iDstPixel], average);  // NOTE(review): store_lo is defined elsewhere; presumably writes only the low half (DEST_ELTS bytes) - confirm
     588     4660840 :         pSrcScanlineShifted += 2 * DEST_ELTS;  // two source pixels are consumed per output pixel
     589             :     }
     590             :     zeroupper();  // macro defined elsewhere in this file; NOTE(review): presumably _mm256_zeroupper() in AVX2 builds - confirm
     591             : 
     592      111036 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;  // report how far the source cursor advanced
     593      111036 :     return iDstPixel;  // number of output pixels written; NOTE(review): the remainder is presumably handled by the caller
     594             : }
     595             : 
     596             : /************************************************************************/
     597             : /*                     QuadraticMeanUInt16SSE2()                        */
     598             : /************************************************************************/
     599             : 
     600             : #ifdef __SSE3__  // SSE3 provides the horizontal add natively
     601             : #define sse2_hadd_pd _mm_hadd_pd
     602             : #else  // SSE2-only fallback computing the same (aLo + aHi, bLo + bHi) result
     603           8 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
     604             : {
     605             :     auto aLo_bLo =  // (aLo, bLo): low doubles of a and b
     606          32 :         _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
     607             :     auto aHi_bHi =  // (aHi, bHi): high doubles of a and b (note the swapped argument order of movehl)
     608          32 :         _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
     609           8 :     return _mm_add_pd(aLo_bLo, aHi_bHi);  // (aLo + aHi, bLo + bHi)
     610             : }
     611             : #endif
     612             : 
     613          40 : inline __m128d SQUARE(__m128d x)  // element-wise x * x on the two packed doubles
     614             : {
     615          40 :     return _mm_mul_pd(x, x);
     616             : }
     617             : 
     618             : #ifdef __AVX2__  // 256-bit overloads, only compiled in AVX2 builds
     619             : 
     620             : inline __m256d SQUARE(__m256d x)  // element-wise x * x on the four packed doubles
     621             : {
     622             :     return _mm256_mul_pd(x, x);
     623             : }
     624             : 
     625             : inline __m256d FIXUP_LANES(__m256d x)  // reorders the 64-bit elements (0, 1, 2, 3) -> (0, 2, 1, 3), i.e. swaps the two middle ones
     626             : {
     627             :     return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
     628             : }
     629             : 
     630             : inline __m256 FIXUP_LANES(__m256 x)  // same 64-bit reordering applied to a float vector (moves pairs of floats)
     631             : {
     632             :     return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
     633             : }
     634             : 
     635             : #endif
     636             : 
     637             : template <class T>  // NOTE(review): T is presumably a 16-bit unsigned pixel type - confirm against callers
     638             : static int
     639          10 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,  // nDstXWidth: output pixels wanted; nChunkXSize: element offset from the first to the second source line
     640             :                         const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,  // in: start of the first source line; out: advanced past the consumed source pixels
     641             :                         T *CPL_RESTRICT pDstScanline)  // receives the RMS values, starting at index 0
     642             : {
     643             :     // Optimized 2x2 RMS (quadratic mean) on UInt16: each output pixel is sqrt(sum of the 4 squared
     644             :     // source values / 4), rounded as described below. Each iteration produces 4 output pixels.
     645          10 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;  // local cursor, written back on exit
     646             : 
     647          10 :     int iDstPixel = 0;
     648          10 :     const auto zero = _mm_setzero_si128();  // interleaved with 16-bit values to zero-extend them to 32 bits
     649             : 
     650             : #ifdef __AVX2__
     651             :     const auto zeroDot25 = _mm256_set1_pd(0.25);
     652             :     const auto zeroDot5 = _mm256_set1_pd(0.5);
     653             : 
     654             :     // The first four 0's could be anything, as we only take the bottom
     655             :     // 128 bits.
     656             :     const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
     657             : #else
     658          10 :     const auto zeroDot25 = _mm_set1_pd(0.25);
     659          10 :     const auto zeroDot5 = _mm_set1_pd(0.5);
     660             : #endif
     661             : 
     662          40 :     for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)  // stops once fewer than 4 outputs remain
     663             :     {
     664             :         // Load 8 UInt16 from each line
     665          30 :         const auto firstLine = _mm_loadu_si128(
     666             :             reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     667             :         const auto secondLine =
     668          30 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     669          30 :                 pSrcScanlineShifted + nChunkXSize));
     670             : 
     671             :         // Detect if all of the source values fit in 14 bits.
     672             :         // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
     673             :         // and we can do a much faster implementation.
     674             :         const auto maskTmp =  // per 16-bit element: the two top bits of (firstLine | secondLine)
     675          60 :             _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
     676             : #if defined(__i386__) || defined(_M_IX86)
     677             :         uint64_t nMaskFitsIn14Bits = 0;  // 32-bit x86 variant: the low 64 bits are stored to memory instead of using _mm_cvtsi128_si64
     678             :         _mm_storel_epi64(
     679             :             reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
     680             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     681             : #else
     682          30 :         const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
     683             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     684             : #endif
     685          30 :         if (nMaskFitsIn14Bits == 0)  // zero means all 16 loaded source values are < 2^14
     686             :         {
     687             :             // Multiplication of 16 bit values and horizontal
     688             :             // addition of 32 bit results
     689             :             const auto firstLineHSumSquare =
     690          26 :                 _mm_madd_epi16(firstLine, firstLine);
     691             :             const auto secondLineHSumSquare =
     692          26 :                 _mm_madd_epi16(secondLine, secondLine);
     693             :             // Vertical addition
     694             :             const auto sumSquares =
     695          26 :                 _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
     696             :             // In theory we should take sqrt(sumSquares * 0.25f)
     697             :             // but given the rounding we do, this is equivalent to
     698             :             // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
     699             :             // sumSquares <= 4 * 16383^2
     700          26 :             const auto one32 = _mm_set1_epi32(1);
     701             :             const auto sumSquaresPlusOneDiv4 =
     702          52 :                 _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
     703             :             // Take square root and truncate/floor to int32
     704          78 :             auto rms = _mm_cvttps_epi32(
     705             :                 _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
     706             : 
     707             :             // Round to upper value if it minimizes the
     708             :             // error |rms^2 - sumSquares/4|
     709             :             // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     710             :             //    rms += 1;
     711             :             // which is equivalent to:
     712             :             // if( rms * rms + rms < (sumSquares+1) / 4 )
     713             :             //    rms += 1;
     714             :             auto mask =
     715          78 :                 _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
     716             :                                 _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));  // rms < 2^14 here, so madd on its 16-bit halves yields rms * rms
     717          26 :             rms = _mm_sub_epi32(rms, mask);  // mask is -1 where the comparison holds, so this adds 1 there
     718             :             // Pack each 32 bit RMS value to 16 bits
     719          26 :             rms = _mm_packs_epi32(rms, rms /* could be anything */);
     720             :             _mm_storel_epi64(
     721          26 :                 reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);  // stores the low 64 bits: 4 output pixels
     722          26 :             pSrcScanlineShifted += 8;  // 8 source pixels per line consumed for 4 outputs
     723          26 :             continue;
     724             :         }
     725             : 
     726             :         // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
     727             :         // to 32 bit would result in 4 multiplications instead of 8, but
     728             :         // mullo/mulhi have a worse throughput than mul_pd.
     729             : 
     730             :         // Extend those UInt16s as UInt32s
     731           4 :         const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
     732           4 :         const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
     733           4 :         const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
     734           4 :         const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
     735             : 
     736             : #ifdef __AVX2__
     737             :         // Multiplication of 32 bit values previously converted to 64 bit double
     738             :         const auto firstLineLoDbl = SQUARE(_mm256_cvtepi32_pd(firstLineLo));
     739             :         const auto firstLineHiDbl = SQUARE(_mm256_cvtepi32_pd(firstLineHi));
     740             :         const auto secondLineLoDbl = SQUARE(_mm256_cvtepi32_pd(secondLineLo));
     741             :         const auto secondLineHiDbl = SQUARE(_mm256_cvtepi32_pd(secondLineHi));
     742             : 
     743             :         // Vertical addition of squares
     744             :         const auto sumSquaresLo =
     745             :             _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
     746             :         const auto sumSquaresHi =
     747             :             _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
     748             : 
     749             :         // Horizontal addition of squares (hadd works per 128-bit lane, hence the FIXUP_LANES reordering)
     750             :         const auto sumSquares =
     751             :             FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
     752             : 
     753             :         const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
     754             : 
     755             :         // Take square root and truncate/floor to int32
     756             :         auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
     757             :         const auto rmsDouble = _mm256_cvtepi32_pd(rms);
     758             :         const auto right = _mm256_sub_pd(  // sumDivWeight - (rms * rms + rms)
     759             :             sumDivWeight, _mm256_add_pd(SQUARE(rmsDouble), rmsDouble));
     760             : 
     761             :         auto mask =  // all-ones 64-bit elements where 0.5 < right, i.e. where rms must be incremented
     762             :             _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
     763             :         // Extract 32-bit from each of the 4 64-bit masks
     764             :         // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
     765             :         // _MM_SHUFFLE(2,0,2,0)));
     766             :         mask = _mm256_permutevar8x32_ps(mask, permutation);
     767             :         const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
     768             : 
     769             :         // Apply the correction
     770             :         rms = _mm_sub_epi32(rms, maskI);
     771             : 
     772             :         // Pack each 32 bit RMS value to 16 bits
     773             :         rms = _mm_packus_epi32(rms, rms /* could be anything */);
     774             : #else
     775             :         // Multiplication of 32 bit values previously converted to 64 bit double
     776           4 :         const auto firstLineLoLo = SQUARE(_mm_cvtepi32_pd(firstLineLo));
     777             :         const auto firstLineLoHi =
     778           8 :             SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));  // byte shift by 8 brings the upper two int32 into the low half
     779           4 :         const auto firstLineHiLo = SQUARE(_mm_cvtepi32_pd(firstLineHi));
     780             :         const auto firstLineHiHi =
     781           8 :             SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
     782             : 
     783           4 :         const auto secondLineLoLo = SQUARE(_mm_cvtepi32_pd(secondLineLo));
     784             :         const auto secondLineLoHi =
     785           8 :             SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
     786           4 :         const auto secondLineHiLo = SQUARE(_mm_cvtepi32_pd(secondLineHi));
     787             :         const auto secondLineHiHi =
     788           8 :             SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
     789             : 
     790             :         // Vertical addition of squares
     791           4 :         const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
     792           4 :         const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
     793           4 :         const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
     794           4 :         const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
     795             : 
     796             :         // Horizontal addition of squares
     797           4 :         const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
     798           4 :         const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
     799             : 
     800           4 :         const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
     801           4 :         const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
     802             :         // Take square root and truncate/floor to int32
     803           8 :         const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
     804           8 :         const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
     805             : 
     806             :         // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
     807             :         // if( 0.5 < sumDivWeight - (rms * rms + rms) )
     808             :         //     rms += 1;
     809           4 :         const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
     810           4 :         const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
     811           8 :         const auto rightLo = _mm_sub_pd(
     812             :             sumDivWeightLo, _mm_add_pd(SQUARE(rmsLoDouble), rmsLoDouble));
     813          12 :         const auto rightHi = _mm_sub_pd(
     814             :             sumDivWeightHi, _mm_add_pd(SQUARE(rmsHiDouble), rmsHiDouble));
     815             : 
     816           8 :         const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
     817           4 :         const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
     818             :         // The value of the mask will be -1 when the correction needs to be
     819             :         // applied
     820           8 :         const auto mask = _mm_castps_si128(_mm_shuffle_ps(  // keeps 32-bit elements 0 and 2 of maskLo, then 0 and 2 of maskHi
     821             :             maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
     822             : 
     823          16 :         auto rms = _mm_castps_si128(  // concatenates the two int32 of rmsLo with the two int32 of rmsHi
     824             :             _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
     825             :         // Apply the correction
     826           4 :         rms = _mm_sub_epi32(rms, mask);
     827             : 
     828             :         // Pack each 32 bit RMS value to 16 bits (sse2_packus_epi32 is defined elsewhere in this file)
     829           4 :         rms = sse2_packus_epi32(rms, rms /* could be anything */);
     830             : #endif
     831             : 
     832           4 :         _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),  // stores the low 64 bits: 4 output pixels
     833             :                          rms);
     834           4 :         pSrcScanlineShifted += 8;  // 8 source pixels per line consumed for 4 outputs
     835             :     }
     836             : 
     837             :     zeroupper();  // macro defined elsewhere in this file; NOTE(review): presumably _mm256_zeroupper() in AVX2 builds - confirm
     838             : 
     839          10 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;  // report how far the source cursor advanced
     840          10 :     return iDstPixel;  // number of output pixels written; NOTE(review): the remainder is presumably handled by the caller
     841             : }
     842             : 
     843             : /************************************************************************/
     844             : /*                         AverageUInt16SSE2()                          */
     845             : /************************************************************************/
     846             : 
     847             : template <class T>  // NOTE(review): T is presumably a 16-bit unsigned pixel type - confirm against callers
     848           9 : static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,  // nDstXWidth: output pixels wanted; nChunkXSize: element offset from the first to the second source line
     849             :                              const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,  // in: start of the first source line; out: advanced past the consumed source pixels
     850             :                              T *CPL_RESTRICT pDstScanline)  // receives the averaged pixels, starting at index 0
     851             : {
     852             :     // Optimized 2x2 average on UInt16: every output pixel is (sum of a 2x2 source block + 2) >> 2.
     853             :     // Each iteration produces 8 output pixels, computed as two halves of 4.
     854             : 
     855           9 :     const auto mask = _mm_set1_epi32(0xFFFF);  // keeps the low 16-bit element of every 32-bit pair
     856           9 :     const auto two = _mm_set1_epi32(2);  // rounding term added before the shift by 2
     857           9 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;  // local cursor, written back on exit
     858             : 
     859           9 :     int iDstPixel = 0;
     860          13 :     for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)  // stops once fewer than 8 outputs remain
     861             :     {
     862             :         __m128i averageLow;  // averages of output pixels 0..3 of this iteration, as 32-bit values
     863             :         // Load 8 UInt16 from each line
     864             :         {
     865           4 :             const auto firstLine = _mm_loadu_si128(
     866             :                 reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     867             :             const auto secondLine =
     868           4 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     869           4 :                     pSrcScanlineShifted + nChunkXSize));
     870             : 
     871             :             // Horizontal addition and extension to 32 bit: (low element) + (high element shifted down) of each 32-bit pair
     872          12 :             const auto horizAddFirstLine = _mm_add_epi32(
     873             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     874             :             const auto horizAddSecondLine =
     875          12 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     876             :                               _mm_srli_epi32(secondLine, 16));
     877             : 
     878             :             // Vertical addition and average computation
     879             :             // average = (sum + 2) >> 2
     880           8 :             const auto sum = _mm_add_epi32(
     881             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     882           4 :             averageLow = _mm_srli_epi32(sum, 2);
     883             :         }
     884             :         // Load the next 8 UInt16 from each line
     885             :         __m128i averageHigh;  // averages of output pixels 4..7 of this iteration, as 32-bit values
     886             :         {
     887           4 :             const auto firstLine = _mm_loadu_si128(
     888           4 :                 reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
     889             :             const auto secondLine =
     890           4 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     891           4 :                     pSrcScanlineShifted + 8 + nChunkXSize));
     892             : 
     893             :             // Horizontal addition and extension to 32 bit
     894          12 :             const auto horizAddFirstLine = _mm_add_epi32(
     895             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     896             :             const auto horizAddSecondLine =
     897          12 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     898             :                               _mm_srli_epi32(secondLine, 16));
     899             : 
     900             :             // Vertical addition and average computation
     901             :             // average = (sum + 2) >> 2
     902           8 :             const auto sum = _mm_add_epi32(
     903             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     904           4 :             averageHigh = _mm_srli_epi32(sum, 2);
     905             :         }
     906             : 
     907             :         // Pack each 32 bit average value to 16 bits (sse2_packus_epi32 is defined elsewhere in this file)
     908           4 :         auto average = sse2_packus_epi32(averageLow, averageHigh);
     909           4 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),  // unaligned store of 8 output pixels
     910             :                          average);
     911           4 :         pSrcScanlineShifted += 16;  // 16 source pixels per line consumed for 8 outputs
     912             :     }
     913             : 
     914           9 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;  // report how far the source cursor advanced
     915           9 :     return iDstPixel;  // number of output pixels written; NOTE(review): the remainder is presumably handled by the caller
     916             : }
     917             : 
     918             : /************************************************************************/
     919             : /*                      QuadraticMeanFloatSSE2()                        */
     920             : /************************************************************************/
     921             : 
     922             : #ifdef __AVX2__  // 256-bit variant: the short names below map to _mm256_* intrinsics
     923             : #define RMS_FLOAT_ELTS 8  // floats per 256-bit register
     924             : #define set1_ps _mm256_set1_ps
     925             : #define loadu_ps _mm256_loadu_ps
     926             : #define andnot_ps _mm256_andnot_ps
     927             : #define and_ps _mm256_and_ps
     928             : #define max_ps _mm256_max_ps
     929             : #define shuffle_ps _mm256_shuffle_ps
     930             : #define div_ps _mm256_div_ps
     931             : #define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)  // AVX has no dedicated cmpeq; use the generic compare with the "equal, ordered, quiet" predicate
     932             : #define mul_ps _mm256_mul_ps
     933             : #define add_ps _mm256_add_ps
     934             : #define hadd_ps _mm256_hadd_ps
     935             : #define sqrt_ps _mm256_sqrt_ps
     936             : #define or_ps _mm256_or_ps
     937             : #define unpacklo_ps _mm256_unpacklo_ps
     938             : #define unpackhi_ps _mm256_unpackhi_ps
     939             : #define storeu_ps _mm256_storeu_ps
     940             : 
     941             : inline __m256 SQUARE(__m256 x)  // element-wise x * x on the eight packed floats
     942             : {
     943             :     return _mm256_mul_ps(x, x);
     944             : }
     945             : 
     946             : #else  // 128-bit variant: the same short names map to _mm_* intrinsics
     947             : 
     948             : #ifdef __SSE3__  // SSE3 provides the horizontal add natively
     949             : #define sse2_hadd_ps _mm_hadd_ps
     950             : #else  // SSE2-only fallback computing the same result
     951             : inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
     952             : {
     953             :     auto aEven_bEven = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));  // (a0, a2, b0, b2)
     954             :     auto aOdd_bOdd = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));  // (a1, a3, b1, b3)
     955             :     return _mm_add_ps(aEven_bEven, aOdd_bOdd);  // (aEven + aOdd, bEven + bOdd)
     956             : }
     957             : #endif
     958             : 
     959             : #define RMS_FLOAT_ELTS 4  // floats per 128-bit register
     960             : #define set1_ps _mm_set1_ps
     961             : #define loadu_ps _mm_loadu_ps
     962             : #define andnot_ps _mm_andnot_ps
     963             : #define and_ps _mm_and_ps
     964             : #define max_ps _mm_max_ps
     965             : #define shuffle_ps _mm_shuffle_ps
     966             : #define div_ps _mm_div_ps
     967             : #define cmpeq_ps _mm_cmpeq_ps
     968             : #define mul_ps _mm_mul_ps
     969             : #define add_ps _mm_add_ps
     970             : #define hadd_ps sse2_hadd_ps
     971             : #define sqrt_ps _mm_sqrt_ps
     972             : #define or_ps _mm_or_ps
     973             : #define unpacklo_ps _mm_unpacklo_ps
     974             : #define unpackhi_ps _mm_unpackhi_ps
     975             : #define storeu_ps _mm_storeu_ps
     976             : 
     977         272 : inline __m128 SQUARE(__m128 x)  // element-wise x * x on the four packed floats
     978             : {
     979         272 :     return _mm_mul_ps(x, x);
     980             : }
     981             : 
     982          68 : inline __m128 FIXUP_LANES(__m128 x)  // identity: a single 128-bit register needs no lane reordering
     983             : {
     984          68 :     return x;
     985             : }
     986             : 
     987             : #endif
     988             : 
     989             : template <class T>  // NOTE(review): T is presumably a 32-bit float pixel type (the data is accessed through float pointers) - confirm against callers
     990             : static int NOINLINE
     991          34 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,  // nDstXWidth: output pixels wanted; nChunkXSize: element offset from the first to the second source line
     992             :                        const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,  // in: start of the first source line; out: advanced past the consumed source pixels
     993             :                        T *CPL_RESTRICT pDstScanline)  // receives the RMS values, starting at index 0
     994             : {
     995             :     // Optimized 2x2 RMS (quadratic mean) on Float32: each output pixel is sqrt(sum of the 4 squared
     996             :     // source values / 4). Each iteration produces RMS_FLOAT_ELTS output pixels.
     997          34 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;  // local cursor, written back on exit
     998             : 
     999          34 :     int iDstPixel = 0;
    1000          34 :     const auto minus_zero = set1_ps(-0.0f);  // only the sign bit set: used to clear signs and to compare against zero
    1001          34 :     const auto zeroDot25 = set1_ps(0.25f);
    1002          34 :     const auto one = set1_ps(1.0f);
    1003          68 :     const auto infv = set1_ps(std::numeric_limits<float>::infinity());
    1004             : 
    1005         102 :     for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);  // stops once fewer than RMS_FLOAT_ELTS outputs remain
    1006             :          iDstPixel += RMS_FLOAT_ELTS)
    1007             :     {
    1008             :         // Load 2*RMS_FLOAT_ELTS Float32 from each line
    1009             :         auto firstLineLo =
    1010          68 :             loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
    1011          68 :         auto firstLineHi = loadu_ps(reinterpret_cast<float const *>(
    1012          68 :             pSrcScanlineShifted + RMS_FLOAT_ELTS));
    1013          68 :         auto secondLineLo = loadu_ps(
    1014          68 :             reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
    1015          68 :         auto secondLineHi = loadu_ps(reinterpret_cast<float const *>(
    1016          68 :             pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize));
    1017             : 
    1018             :         // Take the absolute value (clear the sign bit)
    1019          68 :         firstLineLo = andnot_ps(minus_zero, firstLineLo);
    1020          68 :         firstLineHi = andnot_ps(minus_zero, firstLineHi);
    1021          68 :         secondLineLo = andnot_ps(minus_zero, secondLineLo);
    1022          68 :         secondLineHi = andnot_ps(minus_zero, secondLineHi);
    1023             : 
    1024             :         auto firstLineEven =  // left column of each 2x2 block, first line
    1025          68 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1026             :         auto firstLineOdd =  // right column of each 2x2 block, first line
    1027          68 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1028             :         auto secondLineEven =  // left column of each 2x2 block, second line
    1029          68 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1030             :         auto secondLineOdd =  // right column of each 2x2 block, second line
    1031          68 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1032             : 
    1033             :         // Compute, for each output pixel, the maximum of its 4 absolute source values (all four must take part, or the normalization below is wrong)
    1034         204 :         const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
    1035             :                                  max_ps(secondLineEven, secondLineOdd));
    1036             : 
    1037             :         // Normalize each value by the maximum of its 2x2 block.
    1038             :         // This step is important to avoid that the square evaluates to infinity
    1039             :         // for sufficiently big input.
    1040          68 :         auto invMax = div_ps(one, maxV);
    1041             :         // Deal with 0 being the maximum to correct division by zero
    1042             :         // note: comparing to -0 leads to identical results as to comparing with
    1043             :         // 0
    1044         136 :         invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);  // forces invMax to 0 where maxV == 0
    1045             : 
    1046          68 :         firstLineEven = mul_ps(firstLineEven, invMax);
    1047          68 :         firstLineOdd = mul_ps(firstLineOdd, invMax);
    1048          68 :         secondLineEven = mul_ps(secondLineEven, invMax);
    1049          68 :         secondLineOdd = mul_ps(secondLineOdd, invMax);
    1050             : 
    1051             :         // Compute squares
    1052          68 :         firstLineEven = SQUARE(firstLineEven);
    1053          68 :         firstLineOdd = SQUARE(firstLineOdd);
    1054          68 :         secondLineEven = SQUARE(secondLineEven);
    1055          68 :         secondLineOdd = SQUARE(secondLineOdd);
    1056             : 
    1057         204 :         const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
    1058             :                                        add_ps(secondLineEven, secondLineOdd));
    1059             : 
    1060         204 :         auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));  // undo the normalization: maxV * sqrt(sum of normalized squares / 4)
    1061             : 
    1062             :         // Deal with infinity being the maximum (invMax is then 0 and inf * 0 gives NaN): force the result to infinity
    1063          68 :         const auto maskIsInf = cmpeq_ps(maxV, infv);
    1064         136 :         rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
    1065             : 
    1066          68 :         rms = FIXUP_LANES(rms);  // restores element order after the 256-bit per-lane shuffles; identity in the 128-bit build
    1067             : 
    1068             :         // coverity[incompatible_cast]
    1069          68 :         storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]), rms);
    1070          68 :         pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;  // two source pixels are consumed per output pixel
    1071             :     }
    1072             : 
    1073             :     zeroupper();  // macro defined elsewhere in this file; NOTE(review): presumably _mm256_zeroupper() in AVX2 builds - confirm
    1074             : 
    1075          34 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;  // report how far the source cursor advanced
    1076          34 :     return iDstPixel;  // number of output pixels written; NOTE(review): the remainder is presumably handled by the caller
    1077             : }
    1078             : 
    1079             : /************************************************************************/
    1080             : /*                        AverageFloatSSE2()                            */
    1081             : /************************************************************************/
    1082             : 
    1083             : template <class T>
    1084          14 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
    1085             :                             const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1086             :                             T *CPL_RESTRICT pDstScanline)
    1087             : {
    1088             :     // Optimized 2x2 average on Float32: each loop iteration reads 8 values from each of two source lines and writes 4 output pixels.
    1089             :     // T is accessed as float through reinterpret_cast. Returns the number of output pixels written (0, 4, 8, ...); any remaining pixels are left to the caller.
    1090          14 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
    1091             : 
    1092          14 :     int iDstPixel = 0;
    1093          14 :     const auto zeroDot25 = _mm_set1_ps(0.25f);
    1094             : 
    1095          32 :     for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)  // only full groups of 4 output pixels
    1096             :     {
    1097             :         // Load 8 consecutive Float32 from each of the two source lines (unaligned loads); the second line starts nChunkXSize elements after the first
    1098             :         const auto firstLineLo =
    1099          18 :             _mm_loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
    1100          18 :         const auto firstLineHi = _mm_loadu_ps(
    1101          18 :             reinterpret_cast<float const *>(pSrcScanlineShifted + 4));
    1102          18 :         const auto secondLineLo = _mm_loadu_ps(
    1103          18 :             reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
    1104          18 :         const auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const *>(
    1105          18 :             pSrcScanlineShifted + 4 + nChunkXSize));
    1106             : 
    1107             :         // Vertical addition: add the two source lines element by element
    1108          18 :         const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
    1109          18 :         const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
    1110             : 
    1111             :         // Horizontal addition: A gathers elements 0 and 2 of sumLo then of sumHi, B gathers elements 1 and 3, so A + B sums each pair of horizontal neighbours
    1112             :         const auto A =
    1113          18 :             _mm_shuffle_ps(sumLo, sumHi, 0 | (2 << 2) | (0 << 4) | (2 << 6));
    1114             :         const auto B =
    1115          18 :             _mm_shuffle_ps(sumLo, sumHi, 1 | (3 << 2) | (1 << 4) | (3 << 6));
    1116          18 :         const auto sum = _mm_add_ps(A, B);
    1117             : 
    1118          18 :         const auto average = _mm_mul_ps(sum, zeroDot25);  // sum of 4 source values times 0.25
    1119             : 
    1120             :         // coverity[incompatible_cast]
    1121          18 :         _mm_storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]),
    1122             :                       average);
    1123          18 :         pSrcScanlineShifted += 8;  // 4 output pixels consume 8 source values per line
    1124             :     }
    1125             : 
    1126          14 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;  // hand the advanced source position back through the in/out reference
    1127          14 :     return iDstPixel;
    1128             : }
    1129             : 
    1130             : #endif
    1131             : 
    1132             : /************************************************************************/
    1133             : /*                    GDALResampleChunk_AverageOrRMS()                  */
    1134             : /************************************************************************/
    1135             : 
    1136             : template <class T, class Tsum, GDALDataType eWrkDataType>
    1137             : static CPLErr
    1138       10400 : GDALResampleChunk_AverageOrRMS_T(const GDALOverviewResampleArgs &args,
    1139             :                                  const T *pChunk, void **ppDstBuffer)
    1140             : {
    1141       10400 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    1142       10400 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    1143       10400 :     const double dfSrcXDelta = args.dfSrcXDelta;
    1144       10400 :     const double dfSrcYDelta = args.dfSrcYDelta;
    1145       10400 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    1146       10400 :     const int nChunkXOff = args.nChunkXOff;
    1147       10400 :     const int nChunkYOff = args.nChunkYOff;
    1148       10400 :     const int nChunkXSize = args.nChunkXSize;
    1149       10400 :     const int nChunkYSize = args.nChunkYSize;
    1150       10400 :     const int nDstXOff = args.nDstXOff;
    1151       10400 :     const int nDstXOff2 = args.nDstXOff2;
    1152       10400 :     const int nDstYOff = args.nDstYOff;
    1153       10400 :     const int nDstYOff2 = args.nDstYOff2;
    1154       10400 :     const char *pszResampling = args.pszResampling;
    1155       10400 :     bool bHasNoData = args.bHasNoData;
    1156       10400 :     const double dfNoDataValue = args.dfNoDataValue;
    1157       10400 :     const GDALColorTable *poColorTable = args.poColorTable;
    1158       10400 :     const bool bPropagateNoData = args.bPropagateNoData;
    1159             : 
    1160             :     // AVERAGE_BIT2GRAYSCALE
    1161             :     const bool bBit2Grayscale =
    1162       10400 :         CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
    1163       10401 :     const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
    1164       10401 :     if (bBit2Grayscale)
    1165           9 :         poColorTable = nullptr;
    1166             : 
    1167             :     T tNoDataValue;
    1168       10401 :     if (!bHasNoData)
    1169       10350 :         tNoDataValue = 0;
    1170             :     else
    1171          51 :         tNoDataValue = static_cast<T>(dfNoDataValue);
    1172       10401 :     const T tReplacementVal =
    1173         107 :         bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
    1174          51 :                          args.eOvrDataType, dfNoDataValue))
    1175             :                    : 0;
    1176             : 
    1177       10401 :     int nChunkRightXOff = nChunkXOff + nChunkXSize;
    1178       10401 :     int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    1179       10401 :     int nDstXWidth = nDstXOff2 - nDstXOff;
    1180             : 
    1181             :     /* -------------------------------------------------------------------- */
    1182             :     /*      Allocate buffers.                                               */
    1183             :     /* -------------------------------------------------------------------- */
    1184       10401 :     *ppDstBuffer = static_cast<T *>(
    1185       10401 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
    1186             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
    1187       10401 :     if (*ppDstBuffer == nullptr)
    1188             :     {
    1189           0 :         return CE_Failure;
    1190             :     }
    1191       10401 :     T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
    1192             : 
    1193             :     struct PrecomputedXValue
    1194             :     {
    1195             :         int nLeftXOffShifted;
    1196             :         int nRightXOffShifted;
    1197             :         double dfLeftWeight;
    1198             :         double dfRightWeight;
    1199             :         double dfTotalWeightFullLine;
    1200             :     };
    1201             : 
    1202             :     PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
    1203       10401 :         VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(PrecomputedXValue)));
    1204             : 
    1205       10401 :     if (pasSrcX == nullptr)
    1206             :     {
    1207           0 :         VSIFree(pasSrcX);
    1208           0 :         return CE_Failure;
    1209             :     }
    1210             : 
    1211       10401 :     int nTransparentIdx = -1;
    1212       10401 :     std::vector<GDALColorEntry> colorEntries;
    1213       10401 :     if (poColorTable)
    1214           5 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    1215             : 
    1216             :     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    1217             :     // it as nodata value
    1218       10428 :     if (bHasNoData && dfNoDataValue >= 0.0f &&
    1219          27 :         tNoDataValue < colorEntries.size())
    1220           1 :         colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
    1221             : 
    1222             :     // Or if we have no explicit nodata, but a color table entry that is
    1223             :     // transparent, consider it as the nodata value
    1224       10400 :     else if (!bHasNoData && nTransparentIdx >= 0)
    1225             :     {
    1226           0 :         bHasNoData = true;
    1227           0 :         tNoDataValue = static_cast<T>(nTransparentIdx);
    1228             :     }
    1229             : 
    1230             :     /* ==================================================================== */
    1231             :     /*      Precompute inner loop constants.                                */
    1232             :     /* ==================================================================== */
    1233       10401 :     bool bSrcXSpacingIsTwo = true;
    1234       10401 :     int nLastSrcXOff2 = -1;
    1235      867114 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    1236             :     {
    1237      856713 :         double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    1238             :         // Apply some epsilon to avoid numerical precision issues
    1239      856713 :         int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    1240      856713 :         double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    1241      856713 :         int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    1242             : 
    1243      856713 :         if (nSrcXOff < nChunkXOff)
    1244           0 :             nSrcXOff = nChunkXOff;
    1245      856713 :         if (nSrcXOff2 == nSrcXOff)
    1246           0 :             nSrcXOff2++;
    1247      856713 :         if (nSrcXOff2 > nChunkRightXOff)
    1248           1 :             nSrcXOff2 = nChunkRightXOff;
    1249             : 
    1250      856713 :         pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
    1251      856713 :         pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
    1252      856713 :             nSrcXOff2 - nChunkXOff;
    1253          21 :         pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
    1254      856713 :             (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
    1255      856713 :         pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
    1256      856713 :             1 - (nSrcXOff2 - dfSrcXOff2);
    1257      856713 :         pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
    1258      856713 :             pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
    1259      856713 :         if (nSrcXOff + 1 < nSrcXOff2)
    1260             :         {
    1261      856692 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1262      856692 :                 nSrcXOff2 - nSrcXOff - 2;
    1263      856692 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1264      856692 :                 pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
    1265             :         }
    1266             : 
    1267      856713 :         if (nSrcXOff2 - nSrcXOff != 2 ||
    1268      727221 :             (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
    1269             :         {
    1270      120592 :             bSrcXSpacingIsTwo = false;
    1271             :         }
    1272      856713 :         nLastSrcXOff2 = nSrcXOff2;
    1273             :     }
    1274             : 
    1275             :     /* ==================================================================== */
    1276             :     /*      Loop over destination scanlines.                                */
    1277             :     /* ==================================================================== */
    1278      752881 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    1279             :     {
    1280      742480 :         double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    1281      742480 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    1282      742480 :         if (nSrcYOff < nChunkYOff)
    1283           0 :             nSrcYOff = nChunkYOff;
    1284             : 
    1285      742480 :         double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    1286      742480 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    1287      742480 :         if (nSrcYOff2 == nSrcYOff)
    1288           0 :             ++nSrcYOff2;
    1289      742480 :         if (nSrcYOff2 > nChunkBottomYOff)
    1290           3 :             nSrcYOff2 = nChunkBottomYOff;
    1291             : 
    1292      742480 :         T *const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
    1293             : 
    1294             :         /* --------------------------------------------------------------------
    1295             :          */
    1296             :         /*      Loop over destination pixels */
    1297             :         /* --------------------------------------------------------------------
    1298             :          */
    1299      742480 :         if (poColorTable == nullptr)
    1300             :         {
    1301      742365 :             if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
    1302             :                 pabyChunkNodataMask == nullptr)
    1303             :             {
    1304             :                 if (eWrkDataType == GDT_Byte || eWrkDataType == GDT_UInt16)
    1305             :                 {
    1306             :                     // Optimized case : no nodata, overview by a factor of 2 and
    1307             :                     // regular x and y src spacing.
    1308      116440 :                     const T *pSrcScanlineShifted =
    1309      116440 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1310      116440 :                         static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
    1311      116440 :                             nChunkXSize;
    1312      116440 :                     int iDstPixel = 0;
    1313             : #ifdef USE_SSE2
    1314      116421 :                     if (bQuadraticMean && eWrkDataType == GDT_Byte)
    1315             :                     {
    1316        5385 :                         iDstPixel = QuadraticMeanByteSSE2OrAVX2(
    1317             :                             nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1318             :                             pDstScanline);
    1319             :                     }
    1320      111055 :                     else if (bQuadraticMean /* && eWrkDataType == GDT_UInt16 */)
    1321             :                     {
    1322          10 :                         iDstPixel = QuadraticMeanUInt16SSE2(
    1323             :                             nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1324             :                             pDstScanline);
    1325             :                     }
    1326             :                     else if (/* !bQuadraticMean && */ eWrkDataType == GDT_Byte)
    1327             :                     {
    1328      111036 :                         iDstPixel = AverageByteSSE2OrAVX2(
    1329             :                             nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1330             :                             pDstScanline);
    1331             :                     }
    1332             :                     else /* if( !bQuadraticMean && eWrkDataType == GDT_UInt16 )
    1333             :                           */
    1334             :                     {
    1335           9 :                         iDstPixel = AverageUInt16SSE2(nDstXWidth, nChunkXSize,
    1336             :                                                       pSrcScanlineShifted,
    1337             :                                                       pDstScanline);
    1338             :                     }
    1339             : #endif
    1340      278841 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1341             :                     {
    1342      162401 :                         Tsum nTotal = 0;
    1343             :                         T nVal;
    1344      162401 :                         if (bQuadraticMean)
    1345          44 :                             nTotal =
    1346          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[0]) +
    1347          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[1]) +
    1348          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
    1349          44 :                                 SQUARE<Tsum>(
    1350          44 :                                     pSrcScanlineShifted[1 + nChunkXSize]);
    1351             :                         else
    1352      162357 :                             nTotal = pSrcScanlineShifted[0] +
    1353      162357 :                                      pSrcScanlineShifted[1] +
    1354      162357 :                                      pSrcScanlineShifted[nChunkXSize] +
    1355      162357 :                                      pSrcScanlineShifted[1 + nChunkXSize];
    1356             : 
    1357      162401 :                         constexpr int nTotalWeight = 4;
    1358      162401 :                         if (bQuadraticMean)
    1359          44 :                             nVal = ComputeIntegerRMS_4values<T>(nTotal);
    1360             :                         else
    1361      162357 :                             nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
    1362             :                                                   nTotalWeight);
    1363             : 
    1364             :                         // No need to compare nVal against tNoDataValue as we
    1365             :                         // are in a case where pabyChunkNodataMask == nullptr
    1366             :                         // implies the absence of nodata value.
    1367      162401 :                         pDstScanline[iDstPixel] = nVal;
    1368      162401 :                         pSrcScanlineShifted += 2;
    1369             :                     }
    1370             :                 }
    1371             :                 else
    1372             :                 {
    1373             :                     CPLAssert(eWrkDataType == GDT_Float32 ||
    1374             :                               eWrkDataType == GDT_Float64);
    1375          70 :                     const T *pSrcScanlineShifted =
    1376          70 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1377          70 :                         static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
    1378          70 :                             nChunkXSize;
    1379          70 :                     int iDstPixel = 0;
    1380             : #ifdef USE_SSE2
    1381             :                     if (eWrkDataType == GDT_Float32)
    1382             :                     {
    1383          48 :                         if (bQuadraticMean)
    1384             :                         {
    1385          34 :                             iDstPixel = QuadraticMeanFloatSSE2(
    1386             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1387             :                                 pDstScanline);
    1388             :                         }
    1389             :                         else
    1390             :                         {
    1391          14 :                             iDstPixel = AverageFloatSSE2(
    1392             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1393             :                                 pDstScanline);
    1394             :                         }
    1395             :                     }
    1396             : #endif
    1397             : 
    1398         268 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1399             :                     {
    1400             :                         T nVal;
    1401         198 :                         if (bQuadraticMean)
    1402             :                         {
    1403             :                             // Cast to double to avoid overflows
    1404             :                             // (using std::hypot() is much slower)
    1405         100 :                             nVal = static_cast<T>(std::sqrt(
    1406             :                                 0.25 *
    1407         100 :                                 (SQUARE<double>(pSrcScanlineShifted[0]) +
    1408         100 :                                  SQUARE<double>(pSrcScanlineShifted[1]) +
    1409         100 :                                  SQUARE<double>(
    1410         200 :                                      pSrcScanlineShifted[nChunkXSize]) +
    1411         100 :                                  SQUARE<double>(
    1412         100 :                                      pSrcScanlineShifted[1 + nChunkXSize]))));
    1413             :                         }
    1414             :                         else
    1415             :                         {
    1416          98 :                             nVal = static_cast<T>(
    1417          98 :                                 0.25f * (pSrcScanlineShifted[0] +
    1418          98 :                                          pSrcScanlineShifted[1] +
    1419          98 :                                          pSrcScanlineShifted[nChunkXSize] +
    1420          98 :                                          pSrcScanlineShifted[1 + nChunkXSize]));
    1421             :                         }
    1422             : 
    1423             :                         // No need to compare nVal against tNoDataValue as we
    1424             :                         // are in a case where pabyChunkNodataMask == nullptr
    1425             :                         // implies the absence of nodata value.
    1426         198 :                         pDstScanline[iDstPixel] = nVal;
    1427         198 :                         pSrcScanlineShifted += 2;
    1428             :                     }
    1429      116510 :                 }
    1430             :             }
    1431             :             else
    1432             :             {
    1433          19 :                 const double dfBottomWeight =
    1434      625855 :                     (nSrcYOff + 1 == nSrcYOff2) ? 1.0
    1435      625836 :                                                 : 1.0 - (dfSrcYOff - nSrcYOff);
    1436      625855 :                 const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
    1437      625855 :                 nSrcYOff -= nChunkYOff;
    1438      625855 :                 nSrcYOff2 -= nChunkYOff;
    1439             : 
    1440      625855 :                 double dfTotalWeightFullColumn = dfBottomWeight;
    1441      625855 :                 if (nSrcYOff + 1 < nSrcYOff2)
    1442             :                 {
    1443      625836 :                     dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
    1444      625836 :                     dfTotalWeightFullColumn += dfTopWeight;
    1445             :                 }
    1446             : 
    1447    18585856 :                 for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1448             :                 {
    1449    17959981 :                     const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1450    17959981 :                     const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1451             : 
    1452    17959981 :                     double dfTotal = 0;
    1453    17959981 :                     double dfTotalWeight = 0;
    1454    17959981 :                     if (pabyChunkNodataMask == nullptr)
    1455             :                     {
    1456     1746435 :                         auto pChunkShifted =
    1457         115 :                             pChunk +
    1458     1746435 :                             static_cast<GPtrDiff_t>(nSrcYOff) * nChunkXSize;
    1459     1746435 :                         int nCounterY = nSrcYOff2 - nSrcYOff - 1;
    1460     1746435 :                         double dfWeightY = dfBottomWeight;
    1461     3493427 :                         while (true)
    1462             :                         {
    1463             :                             double dfTotalLine;
    1464     5239852 :                             if (bQuadraticMean)
    1465             :                             {
    1466             :                                 // Left pixel
    1467             :                                 {
    1468         104 :                                     const T val = pChunkShifted[nSrcXOff];
    1469         104 :                                     dfTotalLine =
    1470         104 :                                         SQUARE<double>(val) *
    1471         104 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1472             :                                 }
    1473             : 
    1474         104 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1475             :                                 {
    1476             :                                     // Middle pixels
    1477         104 :                                     for (int iX = nSrcXOff + 1;
    1478         424 :                                          iX + 1 < nSrcXOff2; ++iX)
    1479             :                                     {
    1480         320 :                                         const T val = pChunkShifted[iX];
    1481         320 :                                         dfTotalLine += SQUARE<double>(val);
    1482             :                                     }
    1483             : 
    1484             :                                     // Right pixel
    1485             :                                     {
    1486         104 :                                         const T val =
    1487         104 :                                             pChunkShifted[nSrcXOff2 - 1];
    1488         104 :                                         dfTotalLine +=
    1489         104 :                                             SQUARE<double>(val) *
    1490         104 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1491             :                                     }
    1492             :                                 }
    1493             :                             }
    1494             :                             else
    1495             :                             {
    1496             :                                 // Left pixel
    1497             :                                 {
    1498     5239756 :                                     const T val = pChunkShifted[nSrcXOff];
    1499     5239756 :                                     dfTotalLine =
    1500     5239756 :                                         val * pasSrcX[iDstPixel].dfLeftWeight;
    1501             :                                 }
    1502             : 
    1503     5239756 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1504             :                                 {
    1505             :                                     // Middle pixels
    1506     4239330 :                                     for (int iX = nSrcXOff + 1;
    1507    64183126 :                                          iX + 1 < nSrcXOff2; ++iX)
    1508             :                                     {
    1509    59943836 :                                         const T val = pChunkShifted[iX];
    1510    59943836 :                                         dfTotalLine += val;
    1511             :                                     }
    1512             : 
    1513             :                                     // Right pixel
    1514             :                                     {
    1515     4239330 :                                         const T val =
    1516     4239330 :                                             pChunkShifted[nSrcXOff2 - 1];
    1517     4239330 :                                         dfTotalLine +=
    1518     4239330 :                                             val *
    1519     4239330 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1520             :                                     }
    1521             :                                 }
    1522             :                             }
    1523             : 
    1524     5239852 :                             dfTotal += dfTotalLine * dfWeightY;
    1525     5239852 :                             --nCounterY;
    1526     5239852 :                             if (nCounterY < 0)
    1527     1746435 :                                 break;
    1528     3493427 :                             pChunkShifted += nChunkXSize;
    1529     3493427 :                             dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
    1530             :                         }
    1531             : 
    1532     1746435 :                         dfTotalWeight =
    1533     1746435 :                             pasSrcX[iDstPixel].dfTotalWeightFullLine *
    1534             :                             dfTotalWeightFullColumn;
    1535             :                     }
    1536             :                     else
    1537             :                     {
    1538    16213566 :                         GPtrDiff_t nCount = 0;
    1539    71190898 :                         for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1540             :                         {
    1541    54977432 :                             const auto pChunkShifted =
    1542         132 :                                 pChunk +
    1543    54977432 :                                 static_cast<GPtrDiff_t>(iY) * nChunkXSize;
    1544             : 
    1545    54977432 :                             double dfTotalLine = 0;
    1546    54977432 :                             double dfTotalWeightLine = 0;
    1547             :                             // Left pixel
    1548             :                             {
    1549    54977432 :                                 const int iX = nSrcXOff;
    1550    54977432 :                                 const T val = pChunkShifted[iX];
    1551    54977432 :                                 if (pabyChunkNodataMask[iX + iY * nChunkXSize])
    1552             :                                 {
    1553    23420081 :                                     nCount++;
    1554    23420081 :                                     const double dfWeightX =
    1555    23420081 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1556    23420081 :                                     dfTotalWeightLine = dfWeightX;
    1557    23420081 :                                     if (bQuadraticMean)
    1558          60 :                                         dfTotalLine =
    1559          60 :                                             SQUARE<double>(val) * dfWeightX;
    1560             :                                     else
    1561    23419981 :                                         dfTotalLine = val * dfWeightX;
    1562             :                                 }
    1563             :                             }
    1564             : 
    1565    54977432 :                             if (nSrcXOff + 1 < nSrcXOff2)
    1566             :                             {
    1567             :                                 // Middle pixels
    1568   145172132 :                                 for (int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2;
    1569             :                                      ++iX)
    1570             :                                 {
    1571    90195000 :                                     const T val = pChunkShifted[iX];
    1572    90195000 :                                     if (pabyChunkNodataMask[iX +
    1573    90195000 :                                                             iY * nChunkXSize])
    1574             :                                     {
    1575    39728200 :                                         nCount++;
    1576    39728200 :                                         dfTotalWeightLine += 1;
    1577    39728200 :                                         if (bQuadraticMean)
    1578           0 :                                             dfTotalLine += SQUARE<double>(val);
    1579             :                                         else
    1580    39728200 :                                             dfTotalLine += val;
    1581             :                                     }
    1582             :                                 }
    1583             : 
    1584             :                                 // Right pixel
    1585             :                                 {
    1586    54977432 :                                     const int iX = nSrcXOff2 - 1;
    1587    54977432 :                                     const T val = pChunkShifted[iX];
    1588    54977432 :                                     if (pabyChunkNodataMask[iX +
    1589    54977432 :                                                             iY * nChunkXSize])
    1590             :                                     {
    1591    23419247 :                                         nCount++;
    1592    23419247 :                                         const double dfWeightX =
    1593    23419247 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1594    23419247 :                                         dfTotalWeightLine += dfWeightX;
    1595    23419247 :                                         if (bQuadraticMean)
    1596          65 :                                             dfTotalLine +=
    1597          61 :                                                 SQUARE<double>(val) * dfWeightX;
    1598             :                                         else
    1599    23419246 :                                             dfTotalLine += val * dfWeightX;
    1600             :                                     }
    1601             :                                 }
    1602             :                             }
    1603             : 
    1604    93741198 :                             const double dfWeightY =
    1605             :                                 (iY == nSrcYOff)        ? dfBottomWeight
    1606    38763866 :                                 : (iY + 1 == nSrcYOff2) ? dfTopWeight
    1607             :                                                         : 1.0;
    1608    54977432 :                             dfTotal += dfTotalLine * dfWeightY;
    1609    54977432 :                             dfTotalWeight += dfTotalWeightLine * dfWeightY;
    1610             :                         }
    1611             : 
    1612    16213566 :                         if (nCount == 0 ||
    1613           8 :                             (bPropagateNoData &&
    1614             :                              nCount <
    1615           8 :                                  static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
    1616           8 :                                      (nSrcXOff2 - nSrcXOff)))
    1617             :                         {
    1618     9461842 :                             pDstScanline[iDstPixel] = tNoDataValue;
    1619     9461842 :                             continue;
    1620             :                         }
    1621             :                     }
    1622             :                     if (eWrkDataType == GDT_Byte)
    1623             :                     {
    1624             :                         T nVal;
    1625     8497990 :                         if (bQuadraticMean)
    1626          38 :                             nVal = ComputeIntegerRMS<T, int>(dfTotal,
    1627             :                                                              dfTotalWeight);
    1628             :                         else
    1629     8497950 :                             nVal =
    1630     8497950 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1631     8497990 :                         if (bHasNoData && nVal == tNoDataValue)
    1632           0 :                             nVal = tReplacementVal;
    1633     8497990 :                         pDstScanline[iDstPixel] = nVal;
    1634             :                     }
    1635             :                     else if (eWrkDataType == GDT_UInt16)
    1636             :                     {
    1637             :                         T nVal;
    1638           8 :                         if (bQuadraticMean)
    1639           4 :                             nVal = ComputeIntegerRMS<T, uint64_t>(
    1640             :                                 dfTotal, dfTotalWeight);
    1641             :                         else
    1642           4 :                             nVal =
    1643           4 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1644           8 :                         if (bHasNoData && nVal == tNoDataValue)
    1645           0 :                             nVal = tReplacementVal;
    1646           8 :                         pDstScanline[iDstPixel] = nVal;
    1647             :                     }
    1648             :                     else
    1649             :                     {
    1650             :                         T nVal;
    1651         151 :                         if (bQuadraticMean)
    1652          20 :                             nVal =
    1653          25 :                                 static_cast<T>(sqrt(dfTotal / dfTotalWeight));
    1654             :                         else
    1655         126 :                             nVal = static_cast<T>(dfTotal / dfTotalWeight);
    1656         151 :                         if (bHasNoData && nVal == tNoDataValue)
    1657           2 :                             nVal = tReplacementVal;
    1658         151 :                         pDstScanline[iDstPixel] = nVal;
    1659             :                     }
    1660             :                 }
    1661             :             }
    1662             :         }
    1663             :         else
    1664             :         {
    1665         115 :             nSrcYOff -= nChunkYOff;
    1666         115 :             nSrcYOff2 -= nChunkYOff;
    1667             : 
    1668        6589 :             for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1669             :             {
    1670        6475 :                 const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1671        6475 :                 const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1672             : 
    1673        6475 :                 GPtrDiff_t nTotalR = 0;
    1674        6475 :                 GPtrDiff_t nTotalG = 0;
    1675        6475 :                 GPtrDiff_t nTotalB = 0;
    1676        6475 :                 GPtrDiff_t nCount = 0;
    1677             : 
    1678       19425 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1679             :                 {
    1680       38850 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    1681             :                     {
    1682       25900 :                         const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *
    1683       25900 :                                                       nChunkXSize];
    1684             :                         // cppcheck-suppress unsignedLessThanZero
    1685       25900 :                         if (val < 0 || val >= colorEntries.size())
    1686           0 :                             continue;
    1687       25900 :                         size_t idx = static_cast<size_t>(val);
    1688       25900 :                         const auto &entry = colorEntries[idx];
    1689       25900 :                         if (entry.c4)
    1690             :                         {
    1691       14128 :                             if (bQuadraticMean)
    1692             :                             {
    1693         800 :                                 nTotalR += SQUARE<int>(entry.c1);
    1694         800 :                                 nTotalG += SQUARE<int>(entry.c2);
    1695         800 :                                 nTotalB += SQUARE<int>(entry.c3);
    1696         800 :                                 ++nCount;
    1697             :                             }
    1698             :                             else
    1699             :                             {
    1700       13328 :                                 nTotalR += entry.c1;
    1701       13328 :                                 nTotalG += entry.c2;
    1702       13328 :                                 nTotalB += entry.c3;
    1703       13328 :                                 ++nCount;
    1704             :                             }
    1705             :                         }
    1706             :                     }
    1707             :                 }
    1708             : 
    1709        6475 :                 if (nCount == 0 ||
    1710           0 :                     (bPropagateNoData &&
    1711           0 :                      nCount < static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
    1712           0 :                                   (nSrcXOff2 - nSrcXOff)))
    1713             :                 {
    1714        2838 :                     pDstScanline[iDstPixel] = tNoDataValue;
    1715             :                 }
    1716             :                 else
    1717             :                 {
    1718             :                     GDALColorEntry color;
    1719        3637 :                     if (bQuadraticMean)
    1720             :                     {
    1721         200 :                         color.c1 =
    1722         200 :                             static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
    1723         200 :                         color.c2 =
    1724         200 :                             static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
    1725         200 :                         color.c3 =
    1726         200 :                             static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
    1727             :                     }
    1728             :                     else
    1729             :                     {
    1730        3437 :                         color.c1 =
    1731        3437 :                             static_cast<short>((nTotalR + nCount / 2) / nCount);
    1732        3437 :                         color.c2 =
    1733        3437 :                             static_cast<short>((nTotalG + nCount / 2) / nCount);
    1734        3437 :                         color.c3 =
    1735        3437 :                             static_cast<short>((nTotalB + nCount / 2) / nCount);
    1736             :                     }
    1737        3636 :                     pDstScanline[iDstPixel] =
    1738        3637 :                         static_cast<T>(BestColorEntry(colorEntries, color));
    1739             :                 }
    1740             :             }
    1741             :         }
    1742             :     }
    1743             : 
    1744       10401 :     CPLFree(pasSrcX);
    1745             : 
    1746       10401 :     return CE_None;
    1747             : }
    1748             : 
    1749             : static CPLErr
    1750       10401 : GDALResampleChunk_AverageOrRMS(const GDALOverviewResampleArgs &args,
    1751             :                                const void *pChunk, void **ppDstBuffer,
    1752             :                                GDALDataType *peDstBufferDataType)
    1753             : {
                           // Type dispatcher: forwards the request to the
                           // GDALResampleChunk_AverageOrRMS_T<> instantiation that
                           // matches args.eWrkDataType, after casting pChunk to a pointer
                           // to the corresponding element type (GByte, GUInt16, float or
                           // double). args and ppDstBuffer are passed through unchanged.
                           //
                           // *peDstBufferDataType is always set to args.eWrkDataType, even
                           // when the type turns out to be unsupported.
                           //
                           // Returns whatever the selected instantiation returns, or
                           // CE_Failure when args.eWrkDataType is none of GDT_Byte,
                           // GDT_UInt16, GDT_Float32 or GDT_Float64.
    1754       10401 :     *peDstBufferDataType = args.eWrkDataType;
    1755       10401 :     switch (args.eWrkDataType)
    1756             :     {
    1757       10336 :         case GDT_Byte:
    1758             :         {
    1759       10336 :             return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
    1760       10335 :                 args, static_cast<const GByte *>(pChunk), ppDstBuffer);
    1761             :         }
    1762             : 
    1763           9 :         case GDT_UInt16:
    1764             :         {
                                   // The second template argument depends on the resampling
                                   // name: "RMS" (compared with EQUAL) selects double, anything
                                   // else selects GUInt32.
    1765           9 :             if (EQUAL(args.pszResampling, "RMS"))
    1766             :             {
    1767             :                 // Use double as accumulation type, because UInt32 could overflow
    1768             :                 return GDALResampleChunk_AverageOrRMS_T<GUInt16, double,
    1769           5 :                                                         GDT_UInt16>(
    1770           5 :                     args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
    1771             :             }
    1772             :             else
    1773             :             {
    1774             :                 return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
    1775           4 :                                                         GDT_UInt16>(
    1776           4 :                     args, static_cast<const GUInt16 *>(pChunk), ppDstBuffer);
    1777             :             }
    1778             :         }
    1779             : 
    1780          39 :         case GDT_Float32:
    1781             :         {
    1782          39 :             return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
    1783          39 :                 args, static_cast<const float *>(pChunk), ppDstBuffer);
    1784             :         }
    1785             : 
    1786          17 :         case GDT_Float64:
    1787             :         {
    1788             :             return GDALResampleChunk_AverageOrRMS_T<double, double,
    1789          17 :                                                     GDT_Float64>(
    1790          17 :                 args, static_cast<const double *>(pChunk), ppDstBuffer);
    1791             :         }
    1792             : 
                               // Any other working data type has no instantiation here: leave
                               // the switch and fall into the failure path below.
    1793           0 :         default:
    1794           0 :             break;
    1795             :     }
    1796             : 
                           // Only reached for an unhandled args.eWrkDataType: assert, then
                           // report failure to the caller.
                           // NOTE(review): presumably callers only pass the four types
                           // handled above -- confirm against the call sites.
    1797           0 :     CPLAssert(false);
    1798             :     return CE_Failure;
    1799             : }
    1800             : 
    1801             : /************************************************************************/
    1802             : /*                     GDALResampleChunk_Gauss()                        */
    1803             : /************************************************************************/
    1804             : 
    1805          86 : static CPLErr GDALResampleChunk_Gauss(const GDALOverviewResampleArgs &args,
    1806             :                                       const void *pChunk, void **ppDstBuffer,
    1807             :                                       GDALDataType *peDstBufferDataType)
    1808             : 
    1809             : {
    1810          86 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    1811          86 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    1812          86 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    1813          86 :     const int nChunkXOff = args.nChunkXOff;
    1814          86 :     const int nChunkXSize = args.nChunkXSize;
    1815          86 :     const int nChunkYOff = args.nChunkYOff;
    1816          86 :     const int nChunkYSize = args.nChunkYSize;
    1817          86 :     const int nDstXOff = args.nDstXOff;
    1818          86 :     const int nDstXOff2 = args.nDstXOff2;
    1819          86 :     const int nDstYOff = args.nDstYOff;
    1820          86 :     const int nDstYOff2 = args.nDstYOff2;
    1821          86 :     const bool bHasNoData = args.bHasNoData;
    1822          86 :     double dfNoDataValue = args.dfNoDataValue;
    1823          86 :     const GDALColorTable *poColorTable = args.poColorTable;
    1824             : 
    1825          86 :     const double *const padfChunk = static_cast<const double *>(pChunk);
    1826             : 
    1827          86 :     *ppDstBuffer =
    1828          86 :         VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
    1829             :                             GDALGetDataTypeSizeBytes(GDT_Float64));
    1830          86 :     if (*ppDstBuffer == nullptr)
    1831             :     {
    1832           0 :         return CE_Failure;
    1833             :     }
    1834          86 :     *peDstBufferDataType = GDT_Float64;
    1835          86 :     double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
    1836             : 
    1837             :     /* -------------------------------------------------------------------- */
    1838             :     /*      Create the filter kernel and allocate scanline buffer.          */
    1839             :     /* -------------------------------------------------------------------- */
    1840          86 :     int nGaussMatrixDim = 3;
    1841             :     const int *panGaussMatrix;
    1842          86 :     constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
    1843          86 :     constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
    1844             :                                         4,  6, 24, 36, 24, 6, 4,  16, 24,
    1845             :                                         16, 4, 1,  4,  6,  4, 1};
    1846          86 :     constexpr int anGaussMatrix7x7[] = {
    1847             :         1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
    1848             :         6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
    1849             :         120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
    1850             :         90,  36, 6,  1,   6,   15,  20,  15, 6,  1};
    1851             : 
    1852          86 :     const int nOXSize = args.nOvrXSize;
    1853          86 :     const int nOYSize = args.nOvrYSize;
    1854          86 :     const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    1855             : 
    1856             :     // matrix for gauss filter
    1857          86 :     if (nResYFactor <= 2)
    1858             :     {
    1859          85 :         panGaussMatrix = anGaussMatrix3x3;
    1860          85 :         nGaussMatrixDim = 3;
    1861             :     }
    1862           1 :     else if (nResYFactor <= 4)
    1863             :     {
    1864           0 :         panGaussMatrix = anGaussMatrix5x5;
    1865           0 :         nGaussMatrixDim = 5;
    1866             :     }
    1867             :     else
    1868             :     {
    1869           1 :         panGaussMatrix = anGaussMatrix7x7;
    1870           1 :         nGaussMatrixDim = 7;
    1871             :     }
    1872             : 
    1873             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    1874             :     int *panGaussMatrixDup = static_cast<int *>(
    1875             :         CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
    1876             :     memcpy(panGaussMatrixDup, panGaussMatrix,
    1877             :            sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
    1878             :     panGaussMatrix = panGaussMatrixDup;
    1879             : #endif
    1880             : 
    1881          86 :     if (!bHasNoData)
    1882          79 :         dfNoDataValue = 0.0;
    1883             : 
    1884          86 :     std::vector<GDALColorEntry> colorEntries;
    1885          86 :     int nTransparentIdx = -1;
    1886          86 :     if (poColorTable)
    1887           2 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    1888             : 
    1889             :     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    1890             :     // it as nodata value.
    1891          92 :     if (bHasNoData && dfNoDataValue >= 0.0f &&
    1892           6 :         dfNoDataValue < colorEntries.size())
    1893           0 :         colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
    1894             : 
    1895             :     // Or if we have no explicit nodata, but a color table entry that is
    1896             :     // transparent, consider it as the nodata value.
    1897          86 :     else if (!bHasNoData && nTransparentIdx >= 0)
    1898             :     {
    1899           0 :         dfNoDataValue = nTransparentIdx;
    1900             :     }
    1901             : 
    1902          86 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    1903          86 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    1904          86 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
    1905             : 
    1906             :     /* ==================================================================== */
    1907             :     /*      Loop over destination scanlines.                                */
    1908             :     /* ==================================================================== */
    1909       16488 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    1910             :     {
    1911       16402 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    1912       16402 :         int nSrcYOff2 =
    1913       16402 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
    1914             : 
    1915       16402 :         if (nSrcYOff < nChunkYOff)
    1916             :         {
    1917           0 :             nSrcYOff = nChunkYOff;
    1918           0 :             nSrcYOff2++;
    1919             :         }
    1920             : 
    1921       16402 :         const int iSizeY = nSrcYOff2 - nSrcYOff;
    1922       16402 :         nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
    1923       16402 :         nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
    1924             : 
    1925       16402 :         if (nSrcYOff2 > nChunkBottomYOff ||
    1926       16359 :             (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
    1927             :         {
    1928          44 :             nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
    1929             :         }
    1930             : 
    1931       16402 :         int nYShiftGaussMatrix = 0;
    1932       16402 :         if (nSrcYOff < nChunkYOff)
    1933             :         {
    1934           0 :             nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
    1935           0 :             nSrcYOff = nChunkYOff;
    1936             :         }
    1937             : 
    1938       16402 :         const double *const padfSrcScanline =
    1939       16402 :             padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    1940       16402 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    1941       16402 :         if (pabyChunkNodataMask != nullptr)
    1942         152 :             pabySrcScanlineNodataMask =
    1943         152 :                 pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    1944             : 
    1945             :         /* --------------------------------------------------------------------
    1946             :          */
    1947             :         /*      Loop over destination pixels */
    1948             :         /* --------------------------------------------------------------------
    1949             :          */
    1950       16402 :         double *const padfDstScanline =
    1951       16402 :             padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
    1952     4149980 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    1953             :         {
    1954     4133580 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    1955     4133580 :             int nSrcXOff2 =
    1956     4133580 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
    1957             : 
    1958     4133580 :             if (nSrcXOff < nChunkXOff)
    1959             :             {
    1960           0 :                 nSrcXOff = nChunkXOff;
    1961           0 :                 nSrcXOff2++;
    1962             :             }
    1963             : 
    1964     4133580 :             const int iSizeX = nSrcXOff2 - nSrcXOff;
    1965     4133580 :             nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
    1966     4133580 :             nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
    1967             : 
    1968     4133580 :             if (nSrcXOff2 > nChunkRightXOff ||
    1969     4127930 :                 (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
    1970             :             {
    1971        5650 :                 nSrcXOff2 =
    1972        5650 :                     std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
    1973             :             }
    1974             : 
    1975     4133580 :             int nXShiftGaussMatrix = 0;
    1976     4133580 :             if (nSrcXOff < nChunkXOff)
    1977             :             {
    1978           0 :                 nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
    1979           0 :                 nSrcXOff = nChunkXOff;
    1980             :             }
    1981             : 
    1982     4133580 :             if (poColorTable == nullptr)
    1983             :             {
    1984     4133380 :                 double dfTotal = 0.0;
    1985     4133380 :                 GInt64 nCount = 0;
    1986     4133380 :                 const int *panLineWeight =
    1987     4133380 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    1988             :                     nXShiftGaussMatrix;
    1989             : 
    1990    16527900 :                 for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
    1991    12394500 :                      ++iY, ++j, panLineWeight += nGaussMatrixDim)
    1992             :                 {
    1993    49561300 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    1994             :                     {
    1995    37166800 :                         const double val =
    1996    37166800 :                             padfSrcScanline[iX - nChunkXOff +
    1997    37166800 :                                             static_cast<GPtrDiff_t>(iY -
    1998    37166800 :                                                                     nSrcYOff) *
    1999    37166800 :                                                 nChunkXSize];
    2000    37166800 :                         if (pabySrcScanlineNodataMask == nullptr ||
    2001       32872 :                             pabySrcScanlineNodataMask[iX - nChunkXOff +
    2002       32872 :                                                       static_cast<GPtrDiff_t>(
    2003       32872 :                                                           iY - nSrcYOff) *
    2004       32872 :                                                           nChunkXSize])
    2005             :                         {
    2006    37146100 :                             const int nWeight = panLineWeight[i];
    2007    37146100 :                             dfTotal += val * nWeight;
    2008    37146100 :                             nCount += nWeight;
    2009             :                         }
    2010             :                     }
    2011             :                 }
    2012             : 
    2013     4133380 :                 if (nCount == 0)
    2014             :                 {
    2015        2217 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2016             :                 }
    2017             :                 else
    2018             :                 {
    2019     4131160 :                     padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
    2020             :                 }
    2021             :             }
    2022             :             else
    2023             :             {
    2024         200 :                 GInt64 nTotalR = 0;
    2025         200 :                 GInt64 nTotalG = 0;
    2026         200 :                 GInt64 nTotalB = 0;
    2027         200 :                 GInt64 nTotalWeight = 0;
    2028         200 :                 const int *panLineWeight =
    2029         200 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    2030             :                     nXShiftGaussMatrix;
    2031             : 
    2032         780 :                 for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
    2033         580 :                      ++iY, ++j, panLineWeight += nGaussMatrixDim)
    2034             :                 {
    2035        2262 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    2036             :                     {
    2037        1682 :                         const double val =
    2038        1682 :                             padfSrcScanline[iX - nChunkXOff +
    2039        1682 :                                             static_cast<GPtrDiff_t>(iY -
    2040        1682 :                                                                     nSrcYOff) *
    2041        1682 :                                                 nChunkXSize];
    2042        1682 :                         if (val < 0 || val >= colorEntries.size())
    2043           0 :                             continue;
    2044             : 
    2045        1682 :                         size_t idx = static_cast<size_t>(val);
    2046        1682 :                         if (colorEntries[idx].c4)
    2047             :                         {
    2048        1682 :                             const int nWeight = panLineWeight[i];
    2049        1682 :                             nTotalR +=
    2050        1682 :                                 static_cast<GInt64>(colorEntries[idx].c1) *
    2051        1682 :                                 nWeight;
    2052        1682 :                             nTotalG +=
    2053        1682 :                                 static_cast<GInt64>(colorEntries[idx].c2) *
    2054        1682 :                                 nWeight;
    2055        1682 :                             nTotalB +=
    2056        1682 :                                 static_cast<GInt64>(colorEntries[idx].c3) *
    2057        1682 :                                 nWeight;
    2058        1682 :                             nTotalWeight += nWeight;
    2059             :                         }
    2060             :                     }
    2061             :                 }
    2062             : 
    2063         200 :                 if (nTotalWeight == 0)
    2064             :                 {
    2065           0 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2066             :                 }
    2067             :                 else
    2068             :                 {
    2069             :                     GDALColorEntry color;
    2070             : 
    2071         200 :                     color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
    2072             :                                                   nTotalWeight);
    2073         200 :                     color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
    2074             :                                                   nTotalWeight);
    2075         200 :                     color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
    2076             :                                                   nTotalWeight);
    2077         200 :                     padfDstScanline[iDstPixel - nDstXOff] =
    2078         200 :                         BestColorEntry(colorEntries, color);
    2079             :                 }
    2080             :             }
    2081             :         }
    2082             :     }
    2083             : 
    2084             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    2085             :     CPLFree(panGaussMatrixDup);
    2086             : #endif
    2087             : 
    2088          86 :     return CE_None;
    2089             : }
    2090             : 
    2091             : /************************************************************************/
    2092             : /*                      GDALResampleChunk_Mode()                        */
    2093             : /************************************************************************/
    2094             : 
    2095        4398 : template <class T> static inline bool IsSame(T a, T b)
    2096             : {
    2097        4398 :     return a == b;
    2098             : }
    2099             : 
    2100        4854 : template <> bool IsSame<float>(float a, float b)
    2101             : {
    2102        4854 :     return a == b || (std::isnan(a) && std::isnan(b));
    2103             : }
    2104             : 
    2105         504 : template <> bool IsSame<double>(double a, double b)
    2106             : {
    2107         504 :     return a == b || (std::isnan(a) && std::isnan(b));
    2108             : }
    2109             : 
    2110             : template <>
    2111         480 : bool IsSame<std::complex<float>>(std::complex<float> a, std::complex<float> b)
    2112             : {
    2113         960 :     return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
    2114         960 :                       std::isnan(b.real()) && std::isnan(b.imag()));
    2115             : }
    2116             : 
    2117             : template <>
    2118         480 : bool IsSame<std::complex<double>>(std::complex<double> a,
    2119             :                                   std::complex<double> b)
    2120             : {
    2121         960 :     return a == b || (std::isnan(a.real()) && std::isnan(a.imag()) &&
    2122         960 :                       std::isnan(b.real()) && std::isnan(b.imag()));
    2123             : }
    2124             : 
    2125             : template <class T>
    2126         136 : static CPLErr GDALResampleChunk_ModeT(const GDALOverviewResampleArgs &args,
    2127             :                                       const T *pChunk, T *const pDstBuffer)
    2128             : 
    2129             : {
    2130         136 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    2131         136 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    2132         136 :     const double dfSrcXDelta = args.dfSrcXDelta;
    2133         136 :     const double dfSrcYDelta = args.dfSrcYDelta;
    2134         136 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    2135         136 :     const int nChunkXOff = args.nChunkXOff;
    2136         136 :     const int nChunkXSize = args.nChunkXSize;
    2137         136 :     const int nChunkYOff = args.nChunkYOff;
    2138         136 :     const int nChunkYSize = args.nChunkYSize;
    2139         136 :     const int nDstXOff = args.nDstXOff;
    2140         136 :     const int nDstXOff2 = args.nDstXOff2;
    2141         136 :     const int nDstYOff = args.nDstYOff;
    2142         136 :     const int nDstYOff2 = args.nDstYOff2;
    2143         136 :     const bool bHasNoData = args.bHasNoData;
    2144         136 :     const GDALColorTable *poColorTable = args.poColorTable;
    2145         136 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    2146             : 
    2147           8 :     T tNoDataValue;
    2148             :     if constexpr (std::is_same<T, std::complex<float>>::value ||
    2149             :                   std::is_same<T, std::complex<double>>::value)
    2150             :     {
    2151             :         using BaseT = typename T::value_type;
    2152           8 :         tNoDataValue =
    2153             :             std::complex<BaseT>(std::numeric_limits<BaseT>::quiet_NaN(),
    2154             :                                 std::numeric_limits<BaseT>::quiet_NaN());
    2155             :     }
    2156         128 :     else if (!bHasNoData || !GDALIsValueInRange<T>(args.dfNoDataValue))
    2157         127 :         tNoDataValue = 0;
    2158             :     else
    2159           1 :         tNoDataValue = static_cast<T>(args.dfNoDataValue);
    2160             : 
    2161         136 :     size_t nMaxNumPx = 0;
    2162         136 :     T *paVals = nullptr;
    2163         136 :     int *panSums = nullptr;
    2164             : 
    2165         136 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    2166         136 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    2167         272 :     std::vector<int> anVals(256, 0);
    2168             : 
    2169             :     /* ==================================================================== */
    2170             :     /*      Loop over destination scanlines.                                */
    2171             :     /* ==================================================================== */
    2172        7531 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    2173             :     {
    2174        7395 :         double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    2175        7395 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    2176             : #ifdef only_pixels_with_more_than_10_pct_participation
    2177             :         // When oversampling, don't take into account pixels that have a tiny
    2178             :         // participation in the resulting pixel
    2179             :         if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
    2180             :             nSrcYOff < nChunkBottomYOff)
    2181             :             nSrcYOff++;
    2182             : #endif
    2183        7395 :         if (nSrcYOff < nChunkYOff)
    2184           0 :             nSrcYOff = nChunkYOff;
    2185             : 
    2186        7395 :         double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    2187        7395 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    2188             : #ifdef only_pixels_with_more_than_10_pct_participation
    2189             :         // When oversampling, don't take into account pixels that have a tiny
    2190             :         // participation in the resulting pixel
    2191             :         if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
    2192             :             nSrcYOff2 > nChunkYOff)
    2193             :             nSrcYOff2--;
    2194             : #endif
    2195        7395 :         if (nSrcYOff2 == nSrcYOff)
    2196           0 :             ++nSrcYOff2;
    2197        7395 :         if (nSrcYOff2 > nChunkBottomYOff)
    2198           0 :             nSrcYOff2 = nChunkBottomYOff;
    2199             : 
    2200        7395 :         const T *const paSrcScanline =
    2201         149 :             pChunk +
    2202        7395 :             (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
    2203        7395 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    2204        7395 :         if (pabyChunkNodataMask != nullptr)
    2205        1810 :             pabySrcScanlineNodataMask =
    2206             :                 pabyChunkNodataMask +
    2207        1810 :                 static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
    2208             : 
    2209        7395 :         T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
    2210             :         /* --------------------------------------------------------------------
    2211             :          */
    2212             :         /*      Loop over destination pixels */
    2213             :         /* --------------------------------------------------------------------
    2214             :          */
    2215     4259580 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    2216             :         {
    2217     4252187 :             double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    2218             :             // Apply some epsilon to avoid numerical precision issues
    2219     4252187 :             int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    2220             : #ifdef only_pixels_with_more_than_10_pct_participation
    2221             :             // When oversampling, don't take into account pixels that have a
    2222             :             // tiny participation in the resulting pixel
    2223             :             if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
    2224             :                 nSrcXOff < nChunkRightXOff)
    2225             :                 nSrcXOff++;
    2226             : #endif
    2227     4252187 :             if (nSrcXOff < nChunkXOff)
    2228           0 :                 nSrcXOff = nChunkXOff;
    2229             : 
    2230     4252187 :             double dfSrcXOff2 =
    2231     4252187 :                 dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    2232     4252187 :             int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    2233             : #ifdef only_pixels_with_more_than_10_pct_participation
    2234             :             // When oversampling, don't take into account pixels that have a
    2235             :             // tiny participation in the resulting pixel
    2236             :             if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
    2237             :                 nSrcXOff2 > nChunkXOff)
    2238             :                 nSrcXOff2--;
    2239             : #endif
    2240     4252187 :             if (nSrcXOff2 == nSrcXOff)
    2241           0 :                 nSrcXOff2++;
    2242     4252187 :             if (nSrcXOff2 > nChunkRightXOff)
    2243           0 :                 nSrcXOff2 = nChunkRightXOff;
    2244             : 
    2245     4252187 :             bool bRegularProcessing = false;
    2246             :             if constexpr (!std::is_same<T, GByte>::value)
    2247         827 :                 bRegularProcessing = true;
    2248     4251360 :             else if (poColorTable && poColorTable->GetColorEntryCount() > 256)
    2249           0 :                 bRegularProcessing = true;
    2250             : 
    2251     4252187 :             if (bRegularProcessing)
    2252             :             {
    2253             :                 // Not sure how much sense it makes to run a majority
    2254             :                 // filter on floating point data, but here it is for the sake
    2255             :                 // of compatibility. It won't look right on RGB images by the
    2256             :                 // nature of the filter.
    2257             : 
    2258         827 :                 if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
    2259        2481 :                     nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
    2260         827 :                     static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    2261         827 :                             static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
    2262         827 :                         std::numeric_limits<size_t>::max() / sizeof(float))
    2263             :                 {
    2264           0 :                     CPLError(CE_Failure, CPLE_NotSupported,
    2265             :                              "Too big downsampling factor");
    2266           0 :                     CPLFree(paVals);
    2267           0 :                     CPLFree(panSums);
    2268           0 :                     return CE_Failure;
    2269             :                 }
    2270         827 :                 const size_t nNumPx =
    2271         827 :                     static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    2272         827 :                     static_cast<size_t>(nSrcXOff2 - nSrcXOff);
    2273         827 :                 size_t iMaxInd = 0;
    2274         827 :                 size_t iMaxVal = 0;
    2275         827 :                 bool biMaxValdValid = false;
    2276             : 
    2277         827 :                 if (paVals == nullptr || nNumPx > nMaxNumPx)
    2278             :                 {
    2279             :                     T *paValsNew = static_cast<T *>(
    2280          71 :                         VSI_REALLOC_VERBOSE(paVals, nNumPx * sizeof(T)));
    2281             :                     int *panSumsNew = static_cast<int *>(
    2282          71 :                         VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
    2283          71 :                     if (paValsNew != nullptr)
    2284          71 :                         paVals = paValsNew;
    2285          71 :                     if (panSumsNew != nullptr)
    2286          71 :                         panSums = panSumsNew;
    2287          71 :                     if (paValsNew == nullptr || panSumsNew == nullptr)
    2288             :                     {
    2289           0 :                         CPLFree(paVals);
    2290           0 :                         CPLFree(panSums);
    2291           0 :                         return CE_Failure;
    2292             :                     }
    2293          71 :                     nMaxNumPx = nNumPx;
    2294             :                 }
    2295             : 
    2296        2585 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2297             :                 {
    2298        1758 :                     const GPtrDiff_t iTotYOff =
    2299        1758 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2300        1758 :                         nChunkXOff;
    2301        5690 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2302             :                     {
    2303        3932 :                         if (pabySrcScanlineNodataMask == nullptr ||
    2304          16 :                             pabySrcScanlineNodataMask[iX + iTotYOff])
    2305             :                         {
    2306        3917 :                             const T val = paSrcScanline[iX + iTotYOff];
    2307        3917 :                             size_t i = 0;  // Used after for.
    2308             : 
    2309             :                             // Check array for existing entry.
    2310       14387 :                             for (; i < iMaxInd; ++i)
    2311       17626 :                                 if (IsSame(paVals[i], val) &&
    2312        6910 :                                     ++panSums[i] > panSums[iMaxVal])
    2313             :                                 {
    2314         246 :                                     iMaxVal = i;
    2315         246 :                                     biMaxValdValid = true;
    2316         246 :                                     break;
    2317             :                                 }
    2318             : 
    2319             :                             // Add to arr if entry not already there.
    2320        3917 :                             if (i == iMaxInd)
    2321             :                             {
    2322        3671 :                                 paVals[iMaxInd] = val;
    2323        3671 :                                 panSums[iMaxInd] = 1;
    2324             : 
    2325        3671 :                                 if (!biMaxValdValid)
    2326             :                                 {
    2327         824 :                                     iMaxVal = iMaxInd;
    2328         824 :                                     biMaxValdValid = true;
    2329             :                                 }
    2330             : 
    2331        3671 :                                 ++iMaxInd;
    2332             :                             }
    2333             :                         }
    2334             :                     }
    2335             :                 }
    2336             : 
    2337         827 :                 if (!biMaxValdValid)
    2338           3 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2339             :                 else
    2340         824 :                     paDstScanline[iDstPixel - nDstXOff] = paVals[iMaxVal];
    2341             :             }
    2342             :             else if constexpr (std::is_same<T, GByte>::value)
    2343             :             // ( eSrcDataType == GDT_Byte && nEntryCount < 256 )
    2344             :             {
    2345             :                 // So we go here for a paletted or non-paletted byte band.
    2346             :                 // The input values are then between 0 and 255.
    2347     4251360 :                 int nMaxVal = 0;
    2348     4251360 :                 int iMaxInd = -1;
    2349             : 
    2350             :                 // The cost of this zeroing might be high. Perhaps we should
    2351             :                 // just use the above generic case, and go to this one if the
    2352             :                 // number of source pixels is large enough
    2353     4251360 :                 std::fill(anVals.begin(), anVals.end(), 0);
    2354             : 
    2355    12777700 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2356             :                 {
    2357     8526370 :                     const GPtrDiff_t iTotYOff =
    2358     8526370 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2359     8526370 :                         nChunkXOff;
    2360    25649400 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2361             :                     {
    2362    17123000 :                         const T val = paSrcScanline[iX + iTotYOff];
    2363    17123000 :                         if (!bHasNoData || val != tNoDataValue)
    2364             :                         {
    2365    17123000 :                             int nVal = static_cast<int>(val);
    2366    17123000 :                             if (++anVals[nVal] > nMaxVal)
    2367             :                             {
    2368             :                                 // Sum the density.
    2369             :                                 // Is it the most common value so far?
    2370    17006300 :                                 iMaxInd = nVal;
    2371    17006300 :                                 nMaxVal = anVals[nVal];
    2372             :                             }
    2373             :                         }
    2374             :                     }
    2375             :                 }
    2376             : 
    2377     4251360 :                 if (iMaxInd == -1)
    2378           0 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2379             :                 else
    2380     4251360 :                     paDstScanline[iDstPixel - nDstXOff] =
    2381             :                         static_cast<T>(iMaxInd);
    2382             :             }
    2383             :         }
    2384             :     }
    2385             : 
    2386         136 :     CPLFree(paVals);
    2387         136 :     CPLFree(panSums);
    2388             : 
    2389         136 :     return CE_None;
    2390             : }
    2391             : 
    2392         136 : static CPLErr GDALResampleChunk_Mode(const GDALOverviewResampleArgs &args,
    2393             :                                      const void *pChunk, void **ppDstBuffer,
    2394             :                                      GDALDataType *peDstBufferDataType)
    2395             : {
    2396         136 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(
    2397             :         args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
    2398             :         GDALGetDataTypeSizeBytes(args.eWrkDataType));
    2399         136 :     if (*ppDstBuffer == nullptr)
    2400             :     {
    2401           0 :         return CE_Failure;
    2402             :     }
    2403             : 
    2404         136 :     CPLAssert(args.eSrcDataType == args.eWrkDataType);
    2405             : 
    2406         136 :     *peDstBufferDataType = args.eWrkDataType;
    2407         136 :     switch (args.eWrkDataType)
    2408             :     {
    2409             :         // For mode resampling, as no computation is done, only the
    2410             :         // size of the data type matters... except for Byte where we have
    2411             :         // special processing. And for floating point values
    2412          65 :         case GDT_Byte:
    2413             :         {
    2414          65 :             return GDALResampleChunk_ModeT(args,
    2415             :                                            static_cast<const GByte *>(pChunk),
    2416          65 :                                            static_cast<GByte *>(*ppDstBuffer));
    2417             :         }
    2418             : 
    2419           4 :         case GDT_Int8:
    2420             :         {
    2421           4 :             return GDALResampleChunk_ModeT(args,
    2422             :                                            static_cast<const int8_t *>(pChunk),
    2423           4 :                                            static_cast<int8_t *>(*ppDstBuffer));
    2424             :         }
    2425             : 
    2426           9 :         case GDT_Int16:
    2427             :         case GDT_UInt16:
    2428             :         case GDT_Float16:
    2429             :         {
    2430           9 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 2);
    2431           9 :             return GDALResampleChunk_ModeT(
    2432             :                 args, static_cast<const uint16_t *>(pChunk),
    2433           9 :                 static_cast<uint16_t *>(*ppDstBuffer));
    2434             :         }
    2435             : 
    2436          15 :         case GDT_CInt16:
    2437             :         case GDT_CFloat16:
    2438             :         case GDT_Int32:
    2439             :         case GDT_UInt32:
    2440             :         {
    2441          15 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
    2442          15 :             return GDALResampleChunk_ModeT(
    2443             :                 args, static_cast<const uint32_t *>(pChunk),
    2444          15 :                 static_cast<uint32_t *>(*ppDstBuffer));
    2445             :         }
    2446             : 
    2447          17 :         case GDT_Float32:
    2448             :         {
    2449          17 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 4);
    2450          17 :             return GDALResampleChunk_ModeT(args,
    2451             :                                            static_cast<const float *>(pChunk),
    2452          17 :                                            static_cast<float *>(*ppDstBuffer));
    2453             :         }
    2454             : 
    2455          12 :         case GDT_CInt32:
    2456             :         case GDT_Int64:
    2457             :         case GDT_UInt64:
    2458             :         {
    2459          12 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
    2460          12 :             return GDALResampleChunk_ModeT(
    2461             :                 args, static_cast<const uint64_t *>(pChunk),
    2462          12 :                 static_cast<uint64_t *>(*ppDstBuffer));
    2463             :         }
    2464             : 
    2465           6 :         case GDT_Float64:
    2466             :         {
    2467           6 :             CPLAssert(GDALGetDataTypeSizeBytes(args.eWrkDataType) == 8);
    2468           6 :             return GDALResampleChunk_ModeT(args,
    2469             :                                            static_cast<const double *>(pChunk),
    2470           6 :                                            static_cast<double *>(*ppDstBuffer));
    2471             :         }
    2472             : 
    2473           4 :         case GDT_CFloat32:
    2474             :         {
    2475           4 :             return GDALResampleChunk_ModeT(
    2476             :                 args, static_cast<const std::complex<float> *>(pChunk),
    2477           4 :                 static_cast<std::complex<float> *>(*ppDstBuffer));
    2478             :         }
    2479             : 
    2480           4 :         case GDT_CFloat64:
    2481             :         {
    2482           4 :             return GDALResampleChunk_ModeT(
    2483             :                 args, static_cast<const std::complex<double> *>(pChunk),
    2484           4 :                 static_cast<std::complex<double> *>(*ppDstBuffer));
    2485             :         }
    2486             : 
    2487           0 :         case GDT_Unknown:
    2488             :         case GDT_TypeCount:
    2489           0 :             break;
    2490             :     }
    2491             : 
    2492           0 :     CPLAssert(false);
    2493             :     return CE_Failure;
    2494             : }
    2495             : 
    2496             : /************************************************************************/
    2497             : /*                  GDALResampleConvolutionHorizontal()                 */
    2498             : /************************************************************************/
    2499             : 
    2500             : template <class T>
    2501             : static inline double
    2502       44642 : GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
    2503             :                                   int nSrcPixelCount)
    2504             : {
    2505       44642 :     double dfVal1 = 0.0;
    2506       44642 :     double dfVal2 = 0.0;
    2507       44642 :     int i = 0;  // Used after for.
    2508             :     // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
    2509             :     // manually (untypical) unrolled loop in -O2 and -O3:
    2510             :     // https://github.com/OSGeo/gdal/issues/9508
    2511             : #if !defined(__INTEL_CLANG_COMPILER)
    2512       89044 :     for (; i + 3 < nSrcPixelCount; i += 4)
    2513             :     {
    2514       44402 :         dfVal1 += pChunk[i] * padfWeights[i];
    2515       44402 :         dfVal1 += pChunk[i + 1] * padfWeights[i + 1];
    2516       44402 :         dfVal2 += pChunk[i + 2] * padfWeights[i + 2];
    2517       44402 :         dfVal2 += pChunk[i + 3] * padfWeights[i + 3];
    2518             :     }
    2519             : #endif
    2520       46066 :     for (; i < nSrcPixelCount; ++i)
    2521             :     {
    2522        1424 :         dfVal1 += pChunk[i] * padfWeights[i];
    2523             :     }
    2524       44642 :     return dfVal1 + dfVal2;
    2525             : }
    2526             : 
    2527             : template <class T>
    2528          48 : static inline void GDALResampleConvolutionHorizontalWithMask(
    2529             :     const T *pChunk, const GByte *pabyMask, const double *padfWeights,
    2530             :     int nSrcPixelCount, double &dfVal, double &dfWeightSum)
    2531             : {
    2532          48 :     dfVal = 0;
    2533          48 :     dfWeightSum = 0;
    2534          48 :     int i = 0;
    2535          48 :     for (; i + 3 < nSrcPixelCount; i += 4)
    2536             :     {
    2537           0 :         const double dfWeight0 = padfWeights[i] * pabyMask[i];
    2538           0 :         const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
    2539           0 :         const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
    2540           0 :         const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
    2541           0 :         dfVal += pChunk[i] * dfWeight0;
    2542           0 :         dfVal += pChunk[i + 1] * dfWeight1;
    2543           0 :         dfVal += pChunk[i + 2] * dfWeight2;
    2544           0 :         dfVal += pChunk[i + 3] * dfWeight3;
    2545           0 :         dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
    2546             :     }
    2547         178 :     for (; i < nSrcPixelCount; ++i)
    2548             :     {
    2549         130 :         const double dfWeight = padfWeights[i] * pabyMask[i];
    2550         130 :         dfVal += pChunk[i] * dfWeight;
    2551         130 :         dfWeightSum += dfWeight;
    2552             :     }
    2553          48 : }
    2554             : 
    /**
     * Weighted sum of three rows of source pixels sharing the same weights
     * (generic scalar code).
     *
     * Each row uses two partial sums inside the 4-pixel unrolled loop
     * (pixels 0-1 go to the first, pixels 2-3 to the second); leftover pixels
     * go to the first partial sum. The two partial sums are added at the end.
     *
     * @param pChunkRow1      first row of source pixels.
     * @param pChunkRow2      second row of source pixels.
     * @param pChunkRow3      third row of source pixels.
     * @param padfWeights     convolution weights, one per source pixel.
     * @param nSrcPixelCount  number of source pixels read from each row.
     * @param[out] dfRes1     weighted sum for pChunkRow1.
     * @param[out] dfRes2     weighted sum for pChunkRow2.
     * @param[out] dfRes3     weighted sum for pChunkRow3.
     */
    template <class T>
    static inline void GDALResampleConvolutionHorizontal_3rows(
        const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
        const double *padfWeights, int nSrcPixelCount, double &dfRes1,
        double &dfRes2, double &dfRes3)
    {
        double dfRow1SumA = 0.0;
        double dfRow1SumB = 0.0;
        double dfRow2SumA = 0.0;
        double dfRow2SumB = 0.0;
        double dfRow3SumA = 0.0;
        double dfRow3SumB = 0.0;

        int iPixel = 0;

        // Main part: four source pixels per iteration.
        while (iPixel + 3 < nSrcPixelCount)
        {
            dfRow1SumA += pChunkRow1[iPixel] * padfWeights[iPixel];
            dfRow1SumA += pChunkRow1[iPixel + 1] * padfWeights[iPixel + 1];
            dfRow1SumB += pChunkRow1[iPixel + 2] * padfWeights[iPixel + 2];
            dfRow1SumB += pChunkRow1[iPixel + 3] * padfWeights[iPixel + 3];

            dfRow2SumA += pChunkRow2[iPixel] * padfWeights[iPixel];
            dfRow2SumA += pChunkRow2[iPixel + 1] * padfWeights[iPixel + 1];
            dfRow2SumB += pChunkRow2[iPixel + 2] * padfWeights[iPixel + 2];
            dfRow2SumB += pChunkRow2[iPixel + 3] * padfWeights[iPixel + 3];

            dfRow3SumA += pChunkRow3[iPixel] * padfWeights[iPixel];
            dfRow3SumA += pChunkRow3[iPixel + 1] * padfWeights[iPixel + 1];
            dfRow3SumB += pChunkRow3[iPixel + 2] * padfWeights[iPixel + 2];
            dfRow3SumB += pChunkRow3[iPixel + 3] * padfWeights[iPixel + 3];

            iPixel += 4;
        }

        // Remaining 0 to 3 pixels.
        while (iPixel < nSrcPixelCount)
        {
            dfRow1SumA += pChunkRow1[iPixel] * padfWeights[iPixel];
            dfRow2SumA += pChunkRow2[iPixel] * padfWeights[iPixel];
            dfRow3SumA += pChunkRow3[iPixel] * padfWeights[iPixel];
            ++iPixel;
        }

        dfRes1 = dfRow1SumA + dfRow1SumB;
        dfRes2 = dfRow2SumA + dfRow2SumB;
        dfRes3 = dfRow3SumA + dfRow3SumB;
    }
    2593             : 
    /**
     * Generic version of the 3-row horizontal convolution entry point whose
     * name indicates it is meant for short runs of source pixels (the count is
     * not checked here).
     *
     * It forwards all arguments unchanged to
     * GDALResampleConvolutionHorizontal_3rows().
     *
     * @param pChunkRow1      first row of source pixels.
     * @param pChunkRow2      second row of source pixels.
     * @param pChunkRow3      third row of source pixels.
     * @param padfWeights     convolution weights, one per source pixel.
     * @param nSrcPixelCount  number of source pixels read from each row.
     * @param[out] dfRes1     weighted sum for pChunkRow1.
     * @param[out] dfRes2     weighted sum for pChunkRow2.
     * @param[out] dfRes3     weighted sum for pChunkRow3.
     */
    template <class T>
    static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
        const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
        const double *padfWeights, int nSrcPixelCount, double &dfRes1,
        double &dfRes2, double &dfRes3)
    {
        // No dedicated generic code path: reuse the general 3-row routine.
        return GDALResampleConvolutionHorizontal_3rows(
            pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, nSrcPixelCount,
            dfRes1, dfRes2, dfRes3);
    }
    2604             : 
    /**
     * Generic version of the 3-row horizontal convolution for exactly four
     * source pixels per row.
     *
     * It forwards to GDALResampleConvolutionHorizontal_3rows() with a pixel
     * count of 4.
     *
     * @param pChunkRow1   first row of source pixels (4 entries read).
     * @param pChunkRow2   second row of source pixels (4 entries read).
     * @param pChunkRow3   third row of source pixels (4 entries read).
     * @param padfWeights  convolution weights (4 entries read).
     * @param[out] dfRes1  weighted sum for pChunkRow1.
     * @param[out] dfRes2  weighted sum for pChunkRow2.
     * @param[out] dfRes3  weighted sum for pChunkRow3.
     */
    template <class T>
    static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
        const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
        const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
    {
        // Fixed pixel count implied by the function name.
        constexpr int nFixedSrcPixelCount = 4;
        GDALResampleConvolutionHorizontal_3rows(
            pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, nFixedSrcPixelCount,
            dfRes1, dfRes2, dfRes3);
    }
    2614             : 
    2615             : /************************************************************************/
    2616             : /*                  GDALResampleConvolutionVertical()                   */
    2617             : /************************************************************************/
    2618             : 
    /**
     * Weighted sum down one column of a chunk (generic scalar code).
     *
     * Line i of the column is read at pChunk[i * nStride] and multiplied by
     * padfWeights[i]. Two partial sums are used inside the 4-line unrolled
     * loop and added together at the end.
     *
     * @param pChunk         first element of the column.
     * @param nStride        distance, in elements, between consecutive lines.
     * @param padfWeights    convolution weights, one per source line.
     * @param nSrcLineCount  number of source lines to combine.
     * @return the weighted sum.
     */
    template <class T>
    static inline double
    GDALResampleConvolutionVertical(const T *pChunk, int nStride,
                                    const double *padfWeights, int nSrcLineCount)
    {
        double dfSumA = 0.0;
        double dfSumB = 0.0;

        int iLine = 0;    // index into padfWeights
        int nOffset = 0;  // index into pChunk, always iLine * nStride

        // Main part: four source lines per iteration.
        while (iLine + 3 < nSrcLineCount)
        {
            dfSumA += pChunk[nOffset] * padfWeights[iLine];
            dfSumA += pChunk[nOffset + nStride] * padfWeights[iLine + 1];
            dfSumB += pChunk[nOffset + 2 * nStride] * padfWeights[iLine + 2];
            dfSumB += pChunk[nOffset + 3 * nStride] * padfWeights[iLine + 3];

            iLine += 4;
            nOffset += 4 * nStride;
        }

        // Remaining 0 to 3 lines.
        while (iLine < nSrcLineCount)
        {
            dfSumA += pChunk[nOffset] * padfWeights[iLine];

            ++iLine;
            nOffset += nStride;
        }

        return dfSumA + dfSumB;
    }
    2641             : 
    /**
     * Weighted sum down two adjacent columns of a chunk (generic scalar code).
     *
     * The first column is read at pChunk[i * nStride] and the second at
     * pChunk[i * nStride + 1]; both use weight padfWeights[i]. Each column
     * keeps two partial sums inside the 4-line unrolled loop.
     *
     * @param pChunk         first element of the first column.
     * @param nStride        distance, in elements, between consecutive lines.
     * @param padfWeights    convolution weights, one per source line.
     * @param nSrcLineCount  number of source lines to combine.
     * @param[out] dfRes1    weighted sum for the first column.
     * @param[out] dfRes2    weighted sum for the second column.
     */
    template <class T>
    static inline void GDALResampleConvolutionVertical_2cols(
        const T *pChunk, int nStride, const double *padfWeights, int nSrcLineCount,
        double &dfRes1, double &dfRes2)
    {
        double dfCol0SumA = 0.0;
        double dfCol0SumB = 0.0;
        double dfCol1SumA = 0.0;
        double dfCol1SumB = 0.0;

        int iLine = 0;    // index into padfWeights
        int nOffset = 0;  // index into pChunk, always iLine * nStride

        // Main part: four source lines per iteration.
        while (iLine + 3 < nSrcLineCount)
        {
            // First column.
            dfCol0SumA += pChunk[nOffset] * padfWeights[iLine];
            dfCol0SumA += pChunk[nOffset + nStride] * padfWeights[iLine + 1];
            dfCol0SumB += pChunk[nOffset + 2 * nStride] * padfWeights[iLine + 2];
            dfCol0SumB += pChunk[nOffset + 3 * nStride] * padfWeights[iLine + 3];

            // Second column.
            dfCol1SumA += pChunk[nOffset + 1] * padfWeights[iLine];
            dfCol1SumA += pChunk[nOffset + 1 + nStride] * padfWeights[iLine + 1];
            dfCol1SumB +=
                pChunk[nOffset + 1 + 2 * nStride] * padfWeights[iLine + 2];
            dfCol1SumB +=
                pChunk[nOffset + 1 + 3 * nStride] * padfWeights[iLine + 3];

            iLine += 4;
            nOffset += 4 * nStride;
        }

        // Remaining 0 to 3 lines.
        while (iLine < nSrcLineCount)
        {
            dfCol0SumA += pChunk[nOffset] * padfWeights[iLine];
            dfCol1SumA += pChunk[nOffset + 1] * padfWeights[iLine];

            ++iLine;
            nOffset += nStride;
        }

        dfRes1 = dfCol0SumA + dfCol0SumB;
        dfRes2 = dfCol1SumA + dfCol1SumB;
    }
    2672             : 
    2673             : #ifdef USE_SSE2
    2674             : 
    2675             : #ifdef __AVX__
    2676             : /************************************************************************/
    2677             : /*             GDALResampleConvolutionVertical_16cols<T>                */
    2678             : /************************************************************************/
    2679             : 
    2680             : template <class T>
    2681             : static inline void
    2682             : GDALResampleConvolutionVertical_16cols(const T *pChunk, int nStride,
    2683             :                                        const double *padfWeights,
    2684             :                                        int nSrcLineCount, float *afDest)
    2685             : {
    2686             :     int i = 0;
    2687             :     int j = 0;
    2688             :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
    2689             :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2690             :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    2691             :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
    2692             :     for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
    2693             :     {
    2694             :         XMMReg4Double w0 =
    2695             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
    2696             :         XMMReg4Double w1 =
    2697             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    2698             :         XMMReg4Double w2 =
    2699             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    2700             :         XMMReg4Double w3 =
    2701             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    2702             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
    2703             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    2704             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
    2705             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
    2706             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
    2707             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    2708             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
    2709             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
    2710             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
    2711             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    2712             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
    2713             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
    2714             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
    2715             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    2716             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
    2717             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
    2718             :     }
    2719             :     for (; i < nSrcLineCount; ++i, j += nStride)
    2720             :     {
    2721             :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    2722             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    2723             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    2724             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
    2725             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
    2726             :     }
    2727             :     v_acc0.Store4Val(afDest);
    2728             :     v_acc1.Store4Val(afDest + 4);
    2729             :     v_acc2.Store4Val(afDest + 8);
    2730             :     v_acc3.Store4Val(afDest + 12);
    2731             : }
    2732             : 
    2733             : template <class T>
    2734             : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
    2735             :                                                           const double *, int,
    2736             :                                                           double *)
    2737             : {
    2738             :     // Cannot be reached
    2739             :     CPLAssert(false);
    2740             : }
    2741             : 
    2742             : #else
    2743             : 
    2744             : /************************************************************************/
    2745             : /*              GDALResampleConvolutionVertical_8cols<T>                */
    2746             : /************************************************************************/
    2747             : 
    2748             : template <class T>
    2749             : static inline void
    2750    18635500 : GDALResampleConvolutionVertical_8cols(const T *pChunk, int nStride,
    2751             :                                       const double *padfWeights,
    2752             :                                       int nSrcLineCount, float *afDest)
    2753             : {
    2754    18635500 :     int i = 0;
    2755    18635500 :     int j = 0;
    2756    18635500 :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
    2757    18555800 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2758    33743500 :     for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
    2759             :     {
    2760    15186500 :         XMMReg4Double w0 =
    2761    15186500 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
    2762    15161600 :         XMMReg4Double w1 =
    2763    15161600 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    2764    15190900 :         XMMReg4Double w2 =
    2765    15190900 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    2766    15196200 :         XMMReg4Double w3 =
    2767    15196200 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    2768    15189700 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
    2769    15119900 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    2770    15129500 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
    2771    15115400 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    2772    15117000 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
    2773    15119300 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    2774    15119400 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
    2775    15126900 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    2776             :     }
    2777    29964300 :     for (; i < nSrcLineCount; ++i, j += nStride)
    2778             :     {
    2779    11407300 :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    2780    11407300 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    2781    11407300 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    2782             :     }
    2783    18557000 :     v_acc0.Store4Val(afDest);
    2784    18594800 :     v_acc1.Store4Val(afDest + 4);
    2785    18611400 : }
    2786             : 
    2787             : template <class T>
    2788             : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
    2789             :                                                          const double *, int,
    2790             :                                                          double *)
    2791             : {
    2792             :     // Cannot be reached
    2793             :     CPLAssert(false);
    2794             : }
    2795             : 
    2796             : #endif  // __AVX__
    2797             : 
    2798             : /************************************************************************/
    2799             : /*              GDALResampleConvolutionHorizontalSSE2<T>                */
    2800             : /************************************************************************/
    2801             : 
    2802             : template <class T>
    2803     2738105 : static inline double GDALResampleConvolutionHorizontalSSE2(
    2804             :     const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    2805             : {
    2806     2738105 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2807     2737728 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    2808     2737892 :     int i = 0;  // Used after for.
    2809     2814061 :     for (; i + 7 < nSrcPixelCount; i += 8)
    2810             :     {
    2811             :         // Retrieve the pixel & accumulate
    2812       76083 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
    2813       76083 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
    2814       76083 :         const XMMReg4Double v_weight1 =
    2815       76083 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    2816       76083 :         const XMMReg4Double v_weight2 =
    2817       76083 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    2818             : 
    2819       76083 :         v_acc1 += v_pixels1 * v_weight1;
    2820       76083 :         v_acc2 += v_pixels2 * v_weight2;
    2821             :     }
    2822             : 
    2823     2737969 :     v_acc1 += v_acc2;
    2824             : 
    2825     2737823 :     double dfVal = v_acc1.GetHorizSum();
    2826     9505350 :     for (; i < nSrcPixelCount; ++i)
    2827             :     {
    2828     6767790 :         dfVal += pChunk[i] * padfWeightsAligned[i];
    2829             :     }
    2830     2737566 :     return dfVal;
    2831             : }
    2832             : 
    2833             : /************************************************************************/
    2834             : /*              GDALResampleConvolutionHorizontal<GByte>                */
    2835             : /************************************************************************/
    2836             : 
    2837             : template <>
    2838     2189920 : inline double GDALResampleConvolutionHorizontal<GByte>(
    2839             :     const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    2840             : {
    2841     2189920 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
    2842     2189930 :                                                  nSrcPixelCount);
    2843             : }
    2844             : 
    2845             : template <>
    2846      548287 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
    2847             :     const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    2848             : {
    2849      548287 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
    2850      548501 :                                                  nSrcPixelCount);
    2851             : }
    2852             : 
    2853             : /************************************************************************/
    2854             : /*              GDALResampleConvolutionHorizontalWithMaskSSE2<T>        */
    2855             : /************************************************************************/
    2856             : 
    2857             : template <class T>
    2858     5806833 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
    2859             :     const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
    2860             :     int nSrcPixelCount, double &dfVal, double &dfWeightSum)
    2861             : {
    2862     5806833 :     int i = 0;  // Used after for.
    2863     5806833 :     XMMReg4Double v_acc = XMMReg4Double::Zero();
    2864     5806833 :     XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
    2865    16456921 :     for (; i + 3 < nSrcPixelCount; i += 4)
    2866             :     {
    2867    10650058 :         const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
    2868    10650058 :         const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
    2869    10650058 :         XMMReg4Double v_weight =
    2870    10650058 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    2871    10650058 :         v_weight *= v_mask;
    2872    10650058 :         v_acc += v_pixels * v_weight;
    2873    10650058 :         v_acc_weight += v_weight;
    2874             :     }
    2875             : 
    2876     5806833 :     dfVal = v_acc.GetHorizSum();
    2877     5806833 :     dfWeightSum = v_acc_weight.GetHorizSum();
    2878     6005033 :     for (; i < nSrcPixelCount; ++i)
    2879             :     {
    2880      198202 :         const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
    2881      198202 :         dfVal += pChunk[i] * dfWeight;
    2882      198202 :         dfWeightSum += dfWeight;
    2883             :     }
    2884     5806833 : }
    2885             : 
    2886             : /************************************************************************/
    2887             : /*              GDALResampleConvolutionHorizontalWithMask<GByte>        */
    2888             : /************************************************************************/
    2889             : 
    2890             : template <>
    2891     5806770 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
    2892             :     const GByte *pChunk, const GByte *pabyMask,
    2893             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
    2894             :     double &dfWeightSum)
    2895             : {
    2896     5806770 :     GDALResampleConvolutionHorizontalWithMaskSSE2(
    2897             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
    2898             :         dfWeightSum);
    2899     5806770 : }
    2900             : 
    2901             : template <>
    2902          63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
    2903             :     const GUInt16 *pChunk, const GByte *pabyMask,
    2904             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
    2905             :     double &dfWeightSum)
    2906             : {
    2907          63 :     GDALResampleConvolutionHorizontalWithMaskSSE2(
    2908             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
    2909             :         dfWeightSum);
    2910          63 : }
    2911             : 
    2912             : /************************************************************************/
    2913             : /*              GDALResampleConvolutionHorizontal_3rows_SSE2<T>         */
    2914             : /************************************************************************/
    2915             : 
    2916             : template <class T>
    2917    10026330 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
    2918             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2919             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    2920             :     double &dfRes2, double &dfRes3)
    2921             : {
    2922    10026330 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
    2923    10026330 :                   v_acc2 = XMMReg4Double::Zero(),
    2924    10026330 :                   v_acc3 = XMMReg4Double::Zero();
    2925    10026330 :     int i = 0;
    2926    19994966 :     for (; i + 7 < nSrcPixelCount; i += 8)
    2927             :     {
    2928             :         // Retrieve the pixel & accumulate.
    2929     9968616 :         XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    2930     9968616 :         XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
    2931     9968616 :         const XMMReg4Double v_weight1 =
    2932     9968616 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    2933     9968616 :         const XMMReg4Double v_weight2 =
    2934     9968616 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    2935             : 
    2936     9968616 :         v_acc1 += v_pixels1 * v_weight1;
    2937     9968616 :         v_acc1 += v_pixels2 * v_weight2;
    2938             : 
    2939     9968616 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
    2940     9968616 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
    2941     9968616 :         v_acc2 += v_pixels1 * v_weight1;
    2942     9968616 :         v_acc2 += v_pixels2 * v_weight2;
    2943             : 
    2944     9968616 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
    2945     9968616 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
    2946     9968616 :         v_acc3 += v_pixels1 * v_weight1;
    2947     9968616 :         v_acc3 += v_pixels2 * v_weight2;
    2948             :     }
    2949             : 
    2950    10026330 :     dfRes1 = v_acc1.GetHorizSum();
    2951    10026330 :     dfRes2 = v_acc2.GetHorizSum();
    2952    10026330 :     dfRes3 = v_acc3.GetHorizSum();
    2953    21492926 :     for (; i < nSrcPixelCount; ++i)
    2954             :     {
    2955    11466596 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    2956    11466596 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    2957    11466596 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    2958             :     }
    2959    10026330 : }
    2960             : 
    2961             : /************************************************************************/
    2962             : /*              GDALResampleConvolutionHorizontal_3rows<GByte>          */
    2963             : /************************************************************************/
    2964             : 
    2965             : template <>
    2966    10026300 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
    2967             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    2968             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    2969             :     double &dfRes2, double &dfRes3)
    2970             : {
    2971    10026300 :     GDALResampleConvolutionHorizontal_3rows_SSE2(
    2972             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    2973             :         dfRes1, dfRes2, dfRes3);
    2974    10026300 : }
    2975             : 
    2976             : template <>
    2977          30 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
    2978             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    2979             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    2980             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
    2981             : {
    2982          30 :     GDALResampleConvolutionHorizontal_3rows_SSE2(
    2983             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    2984             :         dfRes1, dfRes2, dfRes3);
    2985          30 : }
    2986             : 
    2987             : /************************************************************************/
    2988             : /*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>   */
    2989             : /************************************************************************/
    2990             : 
    2991             : template <class T>
    2992     2173246 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    2993             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2994             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    2995             :     double &dfRes2, double &dfRes3)
    2996             : {
    2997     2173246 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2998     2173020 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    2999     2173127 :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
    3000     2173118 :     int i = 0;  // Use after for.
    3001     2176437 :     for (; i + 3 < nSrcPixelCount; i += 4)
    3002             :     {
    3003             :         // Retrieve the pixel & accumulate.
    3004        3284 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    3005        3284 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
    3006        3284 :         const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
    3007        3284 :         const XMMReg4Double v_weight =
    3008        3284 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    3009             : 
    3010        3284 :         v_acc1 += v_pixels1 * v_weight;
    3011        3284 :         v_acc2 += v_pixels2 * v_weight;
    3012        3284 :         v_acc3 += v_pixels3 * v_weight;
    3013             :     }
    3014             : 
    3015     2173157 :     dfRes1 = v_acc1.GetHorizSum();
    3016     2173041 :     dfRes2 = v_acc2.GetHorizSum();
    3017     2173053 :     dfRes3 = v_acc3.GetHorizSum();
    3018             : 
    3019     6494380 :     for (; i < nSrcPixelCount; ++i)
    3020             :     {
    3021     4321322 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    3022     4321322 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    3023     4321322 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    3024             :     }
    3025     2173058 : }
    3026             : 
    3027             : /************************************************************************/
    3028             : /*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>    */
    3029             : /************************************************************************/
    3030             : 
    3031             : template <>
    3032     2106390 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
    3033             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3034             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    3035             :     double &dfRes2, double &dfRes3)
    3036             : {
    3037     2106390 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3038             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3039             :         dfRes1, dfRes2, dfRes3);
    3040     2106400 : }
    3041             : 
    3042             : template <>
    3043       66750 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
    3044             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3045             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    3046             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
    3047             : {
    3048       66750 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    3049             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    3050             :         dfRes1, dfRes2, dfRes3);
    3051       66903 : }
    3052             : 
    3053             : /************************************************************************/
    3054             : /*     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>       */
    3055             : /************************************************************************/
    3056             : 
    3057             : template <class T>
    3058    12200610 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3059             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    3060             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
    3061             :     double &dfRes3)
    3062             : {
    3063    12200610 :     const XMMReg4Double v_weight =
    3064             :         XMMReg4Double::Load4ValAligned(padfWeightsAligned);
    3065             : 
    3066             :     // Retrieve the pixel & accumulate.
    3067    12153940 :     const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
    3068    12224670 :     const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
    3069    12190940 :     const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
    3070             : 
    3071    12237040 :     XMMReg4Double v_acc1 = v_pixels1 * v_weight;
    3072    12182230 :     XMMReg4Double v_acc2 = v_pixels2 * v_weight;
    3073    12186190 :     XMMReg4Double v_acc3 = v_pixels3 * v_weight;
    3074             : 
    3075    12178950 :     dfRes1 = v_acc1.GetHorizSum();
    3076    12161000 :     dfRes2 = v_acc2.GetHorizSum();
    3077    12178770 :     dfRes3 = v_acc3.GetHorizSum();
    3078    12204550 : }
    3079             : 
    3080             : /************************************************************************/
    3081             : /*       GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>      */
    3082             : /************************************************************************/
    3083             : 
    3084             : template <>
    3085     6625740 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
    3086             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    3087             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
    3088             :     double &dfRes3)
    3089             : {
    3090     6625740 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3091             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    3092             :         dfRes3);
    3093     6613610 : }
    3094             : 
    3095             : template <>
    3096     5600910 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
    3097             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    3098             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
    3099             :     double &dfRes2, double &dfRes3)
    3100             : {
    3101     5600910 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    3102             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    3103             :         dfRes3);
    3104     5573700 : }
    3105             : 
    3106             : #endif  // USE_SSE2
    3107             : 
    3108             : /************************************************************************/
    3109             : /*                    GDALResampleChunk_Convolution()                   */
    3110             : /************************************************************************/
    3111             : 
    3112             : template <class T, class Twork, GDALDataType eWrkDataType>
    3113        3700 : static CPLErr GDALResampleChunk_ConvolutionT(
    3114             :     const GDALOverviewResampleArgs &args, const T *pChunk, void *pDstBuffer,
    3115             :     FilterFuncType pfnFilterFunc, FilterFunc4ValuesType pfnFilterFunc4Values,
    3116             :     int nKernelRadius, bool bKernelWithNegativeWeights, float fMaxVal)
    3117             : 
    3118             : {
    3119        3700 :     const double dfXRatioDstToSrc = args.dfXRatioDstToSrc;
    3120        3700 :     const double dfYRatioDstToSrc = args.dfYRatioDstToSrc;
    3121        3700 :     const double dfSrcXDelta = args.dfSrcXDelta;
    3122        3700 :     const double dfSrcYDelta = args.dfSrcYDelta;
    3123        3700 :     constexpr int nBands = 1;
    3124        3700 :     const GByte *pabyChunkNodataMask = args.pabyChunkNodataMask;
    3125        3700 :     const int nChunkXOff = args.nChunkXOff;
    3126        3700 :     const int nChunkXSize = args.nChunkXSize;
    3127        3700 :     const int nChunkYOff = args.nChunkYOff;
    3128        3700 :     const int nChunkYSize = args.nChunkYSize;
    3129        3700 :     const int nDstXOff = args.nDstXOff;
    3130        3700 :     const int nDstXOff2 = args.nDstXOff2;
    3131        3700 :     const int nDstYOff = args.nDstYOff;
    3132        3700 :     const int nDstYOff2 = args.nDstYOff2;
    3133        3700 :     const bool bHasNoData = args.bHasNoData;
    3134        3700 :     double dfNoDataValue = args.dfNoDataValue;
    3135             : 
    3136        3700 :     if (!bHasNoData)
    3137        3649 :         dfNoDataValue = 0.0;
    3138        3700 :     const auto dstDataType = args.eOvrDataType;
    3139        3700 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
    3140        3696 :     const double dfReplacementVal =
    3141          46 :         bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
    3142             :                    : dfNoDataValue;
    3143             :     // cppcheck-suppress unreadVariable
    3144        3696 :     const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
    3145        3690 :     const auto nNodataValueInt64 = static_cast<GInt64>(dfNoDataValue);
    3146        3690 :     constexpr int nWrkDataTypeSize = static_cast<int>(sizeof(Twork));
    3147             : 
    3148             :     // TODO: we should have some generic function to do this.
    3149        3690 :     Twork fDstMin = cpl::NumericLimits<Twork>::lowest();
    3150        3690 :     Twork fDstMax = cpl::NumericLimits<Twork>::max();
    3151        3690 :     if (dstDataType == GDT_Byte)
    3152             :     {
    3153        2977 :         fDstMin = std::numeric_limits<GByte>::min();
    3154        2975 :         fDstMax = std::numeric_limits<GByte>::max();
    3155             :     }
    3156         715 :     else if (dstDataType == GDT_Int8)
    3157             :     {
    3158           1 :         fDstMin = std::numeric_limits<GInt8>::min();
    3159           1 :         fDstMax = std::numeric_limits<GInt8>::max();
    3160             :     }
    3161         714 :     else if (dstDataType == GDT_UInt16)
    3162             :     {
    3163         386 :         fDstMin = std::numeric_limits<GUInt16>::min();
    3164         388 :         fDstMax = std::numeric_limits<GUInt16>::max();
    3165             :     }
    3166         329 :     else if (dstDataType == GDT_Int16)
    3167             :     {
    3168         279 :         fDstMin = std::numeric_limits<GInt16>::min();
    3169         279 :         fDstMax = std::numeric_limits<GInt16>::max();
    3170             :     }
    3171          50 :     else if (dstDataType == GDT_UInt32)
    3172             :     {
    3173           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
    3174           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
    3175             :     }
    3176          49 :     else if (dstDataType == GDT_Int32)
    3177             :     {
    3178             :         // cppcheck-suppress unreadVariable
    3179           2 :         fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
    3180             :         // cppcheck-suppress unreadVariable
    3181           2 :         fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
    3182             :     }
    3183          47 :     else if (dstDataType == GDT_UInt64)
    3184             :     {
    3185             :         // cppcheck-suppress unreadVariable
    3186           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<uint64_t>::min());
    3187             :         // cppcheck-suppress unreadVariable
    3188           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<uint64_t>::max());
    3189             :     }
    3190          46 :     else if (dstDataType == GDT_Int64)
    3191             :     {
    3192             :         // cppcheck-suppress unreadVariable
    3193           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<int64_t>::min());
    3194             :         // cppcheck-suppress unreadVariable
    3195           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<int64_t>::max());
    3196             :     }
    3197             : 
    3198    27580835 :     auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
    3199             :                                nNodataValueInt64, dfNoDataValue,
    3200             :                                dfReplacementVal](Twork fVal)
    3201             :     {
    3202    14670600 :         if (!bHasNoData)
    3203    11444200 :             return fVal;
    3204             : 
    3205             :         // Clamp value before comparing to nodata: this is only needed for
    3206             :         // kernels with negative weights (Lanczos)
    3207     3226390 :         Twork fClamped = fVal;
    3208     3226390 :         if (fClamped < fDstMin)
    3209       12874 :             fClamped = fDstMin;
    3210     3213520 :         else if (fClamped > fDstMax)
    3211       12852 :             fClamped = fDstMax;
    3212     3226390 :         if (isIntegerDT)
    3213             :         {
    3214     3226370 :             if (nNodataValueInt64 == static_cast<GInt64>(std::round(fClamped)))
    3215             :             {
    3216             :                 // Do not use the nodata value
    3217       13869 :                 return static_cast<Twork>(dfReplacementVal);
    3218             :             }
    3219             :         }
    3220          24 :         else if (dfNoDataValue == fClamped)
    3221             :         {
    3222             :             // Do not use the nodata value
    3223           1 :             return static_cast<Twork>(dfReplacementVal);
    3224             :         }
    3225     3212520 :         return fClamped;
    3226             :     };
    3227             : 
    3228             :     /* -------------------------------------------------------------------- */
    3229             :     /*      Allocate work buffers.                                          */
    3230             :     /* -------------------------------------------------------------------- */
    3231        3686 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    3232        3686 :     Twork *pafWrkScanline = nullptr;
    3233        3686 :     if (dstDataType != eWrkDataType)
    3234             :     {
    3235             :         pafWrkScanline =
    3236        3646 :             static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
    3237        3656 :         if (pafWrkScanline == nullptr)
    3238           0 :             return CE_Failure;
    3239             :     }
    3240             : 
    3241        3696 :     const double dfXScale = 1.0 / dfXRatioDstToSrc;
    3242        3696 :     const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
    3243        3696 :     const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
    3244        3696 :     const double dfYScale = 1.0 / dfYRatioDstToSrc;
    3245        3696 :     const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
    3246        3696 :     const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
    3247             : 
    3248             :     // Temporary array to store result of horizontal filter.
    3249             :     double *padfHorizontalFiltered = static_cast<double *>(
    3250        3696 :         VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
    3251             : 
    3252             :     // To store convolution coefficients.
    3253        3697 :     double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
    3254             :         static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
    3255             :                          0.5) *
    3256             :         sizeof(double)));
    3257             : 
    3258        3697 :     GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
    3259        3697 :     if (pabyChunkNodataMask)
    3260             :         pabyChunkNodataMaskHorizontalFiltered =
    3261         401 :             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
    3262        3697 :     if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
    3263         401 :         (pabyChunkNodataMask != nullptr &&
    3264             :          pabyChunkNodataMaskHorizontalFiltered == nullptr))
    3265             :     {
    3266           2 :         VSIFree(pafWrkScanline);
    3267           0 :         VSIFree(padfHorizontalFiltered);
    3268           0 :         VSIFreeAligned(padfWeights);
    3269           0 :         VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    3270           0 :         return CE_Failure;
    3271             :     }
    3272             : 
    3273             :     /* ==================================================================== */
    3274             :     /*      First pass: horizontal filter                                   */
    3275             :     /* ==================================================================== */
    3276        3695 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    3277             : #ifdef USE_SSE2
    3278        3695 :     bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
    3279             : #endif
    3280     2724976 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    3281             :     {
    3282     2721270 :         const double dfSrcPixel =
    3283     2721270 :             (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
    3284     2721270 :         int nSrcPixelStart =
    3285     2721270 :             static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
    3286     2721270 :         if (nSrcPixelStart < nChunkXOff)
    3287       55170 :             nSrcPixelStart = nChunkXOff;
    3288     2721270 :         int nSrcPixelStop =
    3289     2721270 :             static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
    3290     2721270 :         if (nSrcPixelStop > nChunkRightXOff)
    3291       55188 :             nSrcPixelStop = nChunkRightXOff;
    3292             : #if 0
    3293             :         if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
    3294             :         {
    3295             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3296             :         }
    3297             :         if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
    3298             :         {
    3299             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3300             :         }
    3301             : #endif
    3302     2721270 :         const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
    3303     2721270 :         double dfWeightSum = 0.0;
    3304             : 
    3305             :         // Compute convolution coefficients.
    3306     2721270 :         int nSrcPixel = nSrcPixelStart;
    3307     2721270 :         double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
    3308     3568066 :         for (; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel += 4)
    3309             :         {
    3310      846729 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
    3311      846729 :             dfX += dfXScaleWeight;
    3312      846729 :             padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
    3313      846729 :             dfX += dfXScaleWeight;
    3314      846729 :             padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
    3315      846729 :             dfX += dfXScaleWeight;
    3316      846729 :             padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
    3317      846729 :             dfX += dfXScaleWeight;
    3318      846792 :             dfWeightSum +=
    3319      846729 :                 pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
    3320             :         }
    3321     6702788 :         for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
    3322             :         {
    3323     3981677 :             const double dfWeight = pfnFilterFunc(dfX);
    3324     3981453 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
    3325     3981453 :             dfWeightSum += dfWeight;
    3326             :         }
    3327             : 
    3328     2721111 :         const int nHeight = nChunkYSize * nBands;
    3329     2721111 :         if (pabyChunkNodataMask == nullptr)
    3330             :         {
    3331     2648768 :             if (dfWeightSum != 0)
    3332             :             {
    3333     2648771 :                 const double dfInvWeightSum = 1.0 / dfWeightSum;
    3334     9456113 :                 for (int i = 0; i < nSrcPixelCount; ++i)
    3335     6807345 :                     padfWeights[i] *= dfInvWeightSum;
    3336             :             }
    3337     2648768 :             int iSrcLineOff = 0;
    3338             : #ifdef USE_SSE2
    3339     2648768 :             if (nSrcPixelCount == 4)
    3340             :             {
    3341    13987066 :                 for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
    3342             :                 {
    3343    13447536 :                     const GPtrDiff_t j =
    3344    13447536 :                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3345    13447536 :                         (nSrcPixelStart - nChunkXOff);
    3346    13447536 :                     double dfVal1 = 0.0;
    3347    13447536 :                     double dfVal2 = 0.0;
    3348    13447536 :                     double dfVal3 = 0.0;
    3349    13447536 :                     GDALResampleConvolutionHorizontalPixelCount4_3rows(
    3350    13447536 :                         pChunk + j, pChunk + j + nChunkXSize,
    3351    13447536 :                         pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
    3352             :                         dfVal2, dfVal3);
    3353    13451656 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3354    13451656 :                                                nDstXSize +
    3355    13451656 :                                            iDstPixel - nDstXOff] = dfVal1;
    3356    13451656 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3357    13451656 :                                             1) *
    3358    13451656 :                                                nDstXSize +
    3359    13451656 :                                            iDstPixel - nDstXOff] = dfVal2;
    3360    13451656 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3361    13451656 :                                             2) *
    3362    13451656 :                                                nDstXSize +
    3363    13451656 :                                            iDstPixel - nDstXOff] = dfVal3;
    3364             :                 }
    3365             :             }
    3366     2113350 :             else if (bSrcPixelCountLess8)
    3367             :             {
    3368     4226190 :                 for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
    3369             :                 {
    3370     2191224 :                     const GPtrDiff_t j =
    3371     2191224 :                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3372     2191224 :                         (nSrcPixelStart - nChunkXOff);
    3373     2191224 :                     double dfVal1 = 0.0;
    3374     2191224 :                     double dfVal2 = 0.0;
    3375     2191224 :                     double dfVal3 = 0.0;
    3376     2191224 :                     GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    3377     2191224 :                         pChunk + j, pChunk + j + nChunkXSize,
    3378     2191224 :                         pChunk + j + 2 * nChunkXSize, padfWeights,
    3379             :                         nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3380     2191453 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3381     2191453 :                                                nDstXSize +
    3382     2191453 :                                            iDstPixel - nDstXOff] = dfVal1;
    3383     2191453 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3384     2191453 :                                             1) *
    3385     2191453 :                                                nDstXSize +
    3386     2191453 :                                            iDstPixel - nDstXOff] = dfVal2;
    3387     2191453 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3388     2191453 :                                             2) *
    3389     2191453 :                                                nDstXSize +
    3390     2191453 :                                            iDstPixel - nDstXOff] = dfVal3;
    3391             :                 }
    3392             :             }
    3393             :             else
    3394             : #endif
    3395             :             {
    3396    10169733 :                 for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
    3397             :                 {
    3398    10091130 :                     const GPtrDiff_t j =
    3399    10091130 :                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3400    10091130 :                         (nSrcPixelStart - nChunkXOff);
    3401    10091130 :                     double dfVal1 = 0.0;
    3402    10091130 :                     double dfVal2 = 0.0;
    3403    10091130 :                     double dfVal3 = 0.0;
    3404    10091130 :                     GDALResampleConvolutionHorizontal_3rows(
    3405    10091130 :                         pChunk + j, pChunk + j + nChunkXSize,
    3406    10091130 :                         pChunk + j + 2 * nChunkXSize, padfWeights,
    3407             :                         nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3408    10091130 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3409    10091130 :                                                nDstXSize +
    3410    10091130 :                                            iDstPixel - nDstXOff] = dfVal1;
    3411    10091130 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3412    10091130 :                                             1) *
    3413    10091130 :                                                nDstXSize +
    3414    10091130 :                                            iDstPixel - nDstXOff] = dfVal2;
    3415    10091130 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3416    10091130 :                                             2) *
    3417    10091130 :                                                nDstXSize +
    3418    10091130 :                                            iDstPixel - nDstXOff] = dfVal3;
    3419             :                 }
    3420             :             }
    3421     5436193 :             for (; iSrcLineOff < nHeight; ++iSrcLineOff)
    3422             :             {
    3423     2782818 :                 const GPtrDiff_t j =
    3424     2782818 :                     static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3425     2782818 :                     (nSrcPixelStart - nChunkXOff);
    3426     5521262 :                 const double dfVal = GDALResampleConvolutionHorizontal(
    3427     2782818 :                     pChunk + j, padfWeights, nSrcPixelCount);
    3428     2783089 :                 padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3429     2783089 :                                            nDstXSize +
    3430     2783089 :                                        iDstPixel - nDstXOff] = dfVal;
    3431             :             }
    3432             :         }
    3433             :         else
    3434             :         {
    3435    18407872 :             for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
    3436             :             {
    3437    18333218 :                 const GPtrDiff_t j =
    3438    18333218 :                     static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3439    18333218 :                     (nSrcPixelStart - nChunkXOff);
    3440             : 
    3441    18333218 :                 if (bKernelWithNegativeWeights)
    3442             :                 {
    3443    17852612 :                     int nConsecutiveValid = 0;
    3444    17852612 :                     int nMaxConsecutiveValid = 0;
    3445   165500458 :                     for (int k = 0; k < nSrcPixelCount; k++)
    3446             :                     {
    3447   147648146 :                         if (pabyChunkNodataMask[j + k])
    3448    40762353 :                             nConsecutiveValid++;
    3449   106885793 :                         else if (nConsecutiveValid)
    3450             :                         {
    3451      105332 :                             nMaxConsecutiveValid = std::max(
    3452      105332 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    3453      105332 :                             nConsecutiveValid = 0;
    3454             :                         }
    3455             :                     }
    3456    17852612 :                     nMaxConsecutiveValid =
    3457    17852612 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    3458    17852612 :                     if (nMaxConsecutiveValid < nSrcPixelCount / 2)
    3459             :                     {
    3460    12526307 :                         const size_t nTempOffset =
    3461    12526307 :                             static_cast<size_t>(iSrcLineOff) * nDstXSize +
    3462    12526307 :                             iDstPixel - nDstXOff;
    3463    12526307 :                         padfHorizontalFiltered[nTempOffset] = 0.0;
    3464    12526307 :                         pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3465    12526307 :                         continue;
    3466             :                     }
    3467             :                 }
    3468             : 
    3469     5806881 :                 double dfVal = 0.0;
    3470     5806881 :                 GDALResampleConvolutionHorizontalWithMask(
    3471     5806881 :                     pChunk + j, pabyChunkNodataMask + j, padfWeights,
    3472             :                     nSrcPixelCount, dfVal, dfWeightSum);
    3473     5809278 :                 const size_t nTempOffset =
    3474     5809278 :                     static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
    3475     5809278 :                     nDstXOff;
    3476     5809278 :                 if (dfWeightSum > 0.0)
    3477             :                 {
    3478     5762218 :                     padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
    3479     5762218 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
    3480             :                 }
    3481             :                 else
    3482             :                 {
    3483       47115 :                     padfHorizontalFiltered[nTempOffset] = 0.0;
    3484       47115 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3485             :                 }
    3486             :             }
    3487             :         }
    3488             :     }
    3489             : 
    3490             :     /* ==================================================================== */
    3491             :     /*      Second pass: vertical filter                                    */
    3492             :     /* ==================================================================== */
    3493        3703 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    3494             : 
    3495      198221 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    3496             :     {
    3497      194518 :         Twork *const pafDstScanline =
    3498      194518 :             pafWrkScanline ? pafWrkScanline
    3499        8421 :                            : static_cast<Twork *>(pDstBuffer) +
    3500        8421 :                                  (iDstLine - nDstYOff) * nDstXSize;
    3501             : 
    3502      194518 :         const double dfSrcLine =
    3503      194518 :             (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
    3504      194518 :         int nSrcLineStart =
    3505      194518 :             static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
    3506      194518 :         int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
    3507      194518 :         if (nSrcLineStart < nChunkYOff)
    3508        2345 :             nSrcLineStart = nChunkYOff;
    3509      194518 :         if (nSrcLineStop > nChunkBottomYOff)
    3510        2381 :             nSrcLineStop = nChunkBottomYOff;
    3511             : #if 0
    3512             :         if( nSrcLineStart < nChunkYOff &&
    3513             :             nChunkYOff > 0 )
    3514             :         {
    3515             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    3516             :         }
    3517             :         if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
    3518             :         {
    3519             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    3520             :         }
    3521             : #endif
    3522      194518 :         const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
    3523      194518 :         double dfWeightSum = 0.0;
    3524             : 
    3525             :         // Compute convolution coefficients.
    3526      194518 :         int nSrcLine = nSrcLineStart;  // Used after for.
    3527      194518 :         double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
    3528      432222 :         for (; nSrcLine + 3 < nSrcLineStop;
    3529      237704 :              nSrcLine += 4, dfY += 4 * dfYScaleWeight)
    3530             :         {
    3531      237713 :             padfWeights[nSrcLine - nSrcLineStart] = dfY;
    3532      237713 :             padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
    3533      237713 :             padfWeights[nSrcLine + 2 - nSrcLineStart] =
    3534      237713 :                 dfY + 2 * dfYScaleWeight;
    3535      237713 :             padfWeights[nSrcLine + 3 - nSrcLineStart] =
    3536      237713 :                 dfY + 3 * dfYScaleWeight;
    3537      237704 :             dfWeightSum +=
    3538      237713 :                 pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
    3539             :         }
    3540      227791 :         for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
    3541             :         {
    3542       33280 :             const double dfWeight = pfnFilterFunc(dfY);
    3543       33282 :             padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
    3544       33282 :             dfWeightSum += dfWeight;
    3545             :         }
    3546             : 
    3547      194511 :         if (pabyChunkNodataMask == nullptr)
    3548             :         {
    3549      159959 :             if (dfWeightSum != 0)
    3550             :             {
    3551      159960 :                 const double dfInvWeightSum = 1.0 / dfWeightSum;
    3552      901423 :                 for (int i = 0; i < nSrcLineCount; ++i)
    3553      741463 :                     padfWeights[i] *= dfInvWeightSum;
    3554             :             }
    3555             :         }
    3556             : 
    3557      194511 :         if (pabyChunkNodataMask == nullptr)
    3558             :         {
    3559      159960 :             int iFilteredPixelOff = 0;  // Used after for.
    3560             :             // j used after for.
    3561      159960 :             size_t j =
    3562      159960 :                 (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
    3563             : #ifdef USE_SSE2
    3564             :             if constexpr (eWrkDataType == GDT_Float32)
    3565             :             {
    3566             : #ifdef __AVX__
    3567             :                 for (; iFilteredPixelOff + 15 < nDstXSize;
    3568             :                      iFilteredPixelOff += 16, j += 16)
    3569             :                 {
    3570             :                     GDALResampleConvolutionVertical_16cols(
    3571             :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3572             :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    3573             :                     if (bHasNoData)
    3574             :                     {
    3575             :                         for (int k = 0; k < 16; k++)
    3576             :                         {
    3577             :                             pafDstScanline[iFilteredPixelOff + k] =
    3578             :                                 replaceValIfNodata(
    3579             :                                     pafDstScanline[iFilteredPixelOff + k]);
    3580             :                         }
    3581             :                     }
    3582             :                 }
    3583             : #else
    3584    18784486 :                 for (; iFilteredPixelOff + 7 < nDstXSize;
    3585             :                      iFilteredPixelOff += 8, j += 8)
    3586             :                 {
    3587    18640780 :                     GDALResampleConvolutionVertical_8cols(
    3588    18640780 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3589    18640780 :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    3590    18631800 :                     if (bHasNoData)
    3591             :                     {
    3592       17820 :                         for (int k = 0; k < 8; k++)
    3593             :                         {
    3594       15840 :                             pafDstScanline[iFilteredPixelOff + k] =
    3595       15840 :                                 replaceValIfNodata(
    3596       15840 :                                     pafDstScanline[iFilteredPixelOff + k]);
    3597             :                         }
    3598             :                     }
    3599             :                 }
    3600             : #endif
    3601             : 
    3602      607250 :                 for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
    3603             :                 {
    3604      463611 :                     const Twork fVal =
    3605      463520 :                         static_cast<Twork>(GDALResampleConvolutionVertical(
    3606      463520 :                             padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3607             :                             nSrcLineCount));
    3608      463546 :                     pafDstScanline[iFilteredPixelOff] =
    3609      463611 :                         replaceValIfNodata(fVal);
    3610             :                 }
    3611             :             }
    3612             :             else
    3613             : #endif
    3614             :             {
    3615     2887210 :                 for (; iFilteredPixelOff + 1 < nDstXSize;
    3616             :                      iFilteredPixelOff += 2, j += 2)
    3617             :                 {
    3618     2880000 :                     double dfVal1 = 0.0;
    3619     2880000 :                     double dfVal2 = 0.0;
    3620     2880000 :                     GDALResampleConvolutionVertical_2cols(
    3621     2880000 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3622             :                         nSrcLineCount, dfVal1, dfVal2);
    3623     5760010 :                     pafDstScanline[iFilteredPixelOff] =
    3624     2880000 :                         replaceValIfNodata(static_cast<Twork>(dfVal1));
    3625     2880000 :                     pafDstScanline[iFilteredPixelOff + 1] =
    3626     2880000 :                         replaceValIfNodata(static_cast<Twork>(dfVal2));
    3627             :                 }
    3628        7206 :                 if (iFilteredPixelOff < nDstXSize)
    3629             :                 {
    3630           2 :                     const double dfVal = GDALResampleConvolutionVertical(
    3631           2 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3632             :                         nSrcLineCount);
    3633           2 :                     pafDstScanline[iFilteredPixelOff] =
    3634           2 :                         replaceValIfNodata(static_cast<Twork>(dfVal));
    3635             :                 }
    3636             :             }
    3637             :         }
    3638             :         else
    3639             :         {
    3640    17349040 :             for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
    3641             :                  ++iFilteredPixelOff)
    3642             :             {
    3643    17314505 :                 double dfVal = 0.0;
    3644    17314505 :                 dfWeightSum = 0.0;
    3645    17314505 :                 size_t j = (nSrcLineStart - nChunkYOff) *
    3646    17314505 :                                static_cast<size_t>(nDstXSize) +
    3647    17314505 :                            iFilteredPixelOff;
    3648    17314505 :                 if (bKernelWithNegativeWeights)
    3649             :                 {
    3650    17089601 :                     int nConsecutiveValid = 0;
    3651    17089601 :                     int nMaxConsecutiveValid = 0;
    3652   121806321 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    3653             :                     {
    3654   104717020 :                         const double dfWeight =
    3655   104717020 :                             padfWeights[i] *
    3656             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    3657   104717020 :                         if (pabyChunkNodataMaskHorizontalFiltered[j])
    3658             :                         {
    3659    42068237 :                             nConsecutiveValid++;
    3660             :                         }
    3661    62648683 :                         else if (nConsecutiveValid)
    3662             :                         {
    3663      203800 :                             nMaxConsecutiveValid = std::max(
    3664      203800 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    3665      203800 :                             nConsecutiveValid = 0;
    3666             :                         }
    3667   104717020 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    3668   104717020 :                         dfWeightSum += dfWeight;
    3669             :                     }
    3670    17089601 :                     nMaxConsecutiveValid =
    3671    17089601 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    3672    17089601 :                     if (nMaxConsecutiveValid < nSrcLineCount / 2)
    3673             :                     {
    3674     8867341 :                         pafDstScanline[iFilteredPixelOff] =
    3675     8867249 :                             static_cast<Twork>(dfNoDataValue);
    3676     8867341 :                         continue;
    3677             :                     }
    3678             :                 }
    3679             :                 else
    3680             :                 {
    3681     1130262 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    3682             :                     {
    3683      905432 :                         const double dfWeight =
    3684      905432 :                             padfWeights[i] *
    3685             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    3686      905432 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    3687      905432 :                         dfWeightSum += dfWeight;
    3688             :                     }
    3689             :                 }
    3690     8447134 :                 if (dfWeightSum > 0.0)
    3691             :                 {
    3692     8431093 :                     pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
    3693     8431081 :                         static_cast<Twork>(dfVal / dfWeightSum));
    3694             :                 }
    3695             :                 else
    3696             :                 {
    3697       16045 :                     pafDstScanline[iFilteredPixelOff] =
    3698       16021 :                         static_cast<Twork>(dfNoDataValue);
    3699             :                 }
    3700             :             }
    3701             :         }
    3702             : 
    3703      185487 :         if (fMaxVal != 0.0f)
    3704             :         {
    3705      192324 :             for (int i = 0; i < nDstXSize; ++i)
    3706             :             {
    3707      192088 :                 if (pafDstScanline[i] > fMaxVal)
    3708       96022 :                     pafDstScanline[i] = fMaxVal;
    3709             :             }
    3710             :         }
    3711             : 
    3712      185487 :         if (pafWrkScanline)
    3713             :         {
    3714      186098 :             GDALCopyWords64(pafWrkScanline, eWrkDataType, nWrkDataTypeSize,
    3715             :                             static_cast<GByte *>(pDstBuffer) +
    3716      186098 :                                 static_cast<size_t>(iDstLine - nDstYOff) *
    3717      186098 :                                     nDstXSize * nDstDataTypeSize,
    3718             :                             dstDataType, nDstDataTypeSize, nDstXSize);
    3719             :         }
    3720             :     }
    3721             : 
    3722        3703 :     VSIFree(pafWrkScanline);
    3723        3703 :     VSIFreeAligned(padfWeights);
    3724        3703 :     VSIFree(padfHorizontalFiltered);
    3725        3703 :     VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    3726             : 
    3727        3703 :     return CE_None;
    3728             : }
    3729             : 
    3730             : static CPLErr
    3731        3702 : GDALResampleChunk_Convolution(const GDALOverviewResampleArgs &args,
    3732             :                               const void *pChunk, void **ppDstBuffer,
    3733             :                               GDALDataType *peDstBufferDataType)
    3734             : {
    3735             :     GDALResampleAlg eResample;
    3736        3702 :     bool bKernelWithNegativeWeights = false;
    3737        3702 :     if (EQUAL(args.pszResampling, "BILINEAR"))
    3738        2597 :         eResample = GRA_Bilinear;
    3739        1105 :     else if (EQUAL(args.pszResampling, "CUBIC"))
    3740             :     {
    3741        1027 :         eResample = GRA_Cubic;
    3742        1027 :         bKernelWithNegativeWeights = true;
    3743             :     }
    3744          78 :     else if (EQUAL(args.pszResampling, "CUBICSPLINE"))
    3745          23 :         eResample = GRA_CubicSpline;
    3746          55 :     else if (EQUAL(args.pszResampling, "LANCZOS"))
    3747             :     {
    3748          54 :         eResample = GRA_Lanczos;
    3749          54 :         bKernelWithNegativeWeights = true;
    3750             :     }
    3751             :     else
    3752             :     {
    3753           1 :         CPLAssert(false);
    3754             :         return CE_Failure;
    3755             :     }
    3756        3701 :     const int nKernelRadius = GWKGetFilterRadius(eResample);
    3757        3700 :     FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
    3758             :     const FilterFunc4ValuesType pfnFilterFunc4Values =
    3759        3698 :         GWKGetFilterFunc4Values(eResample);
    3760             : 
    3761        3696 :     float fMaxVal = 0.f;
    3762             :     // Cubic, etc... can have overshoots, so make sure we clamp values to the
    3763             :     // maximum value if NBITS is set.
    3764        3696 :     if (eResample != GRA_Bilinear && args.nOvrNBITS > 0 &&
    3765           8 :         (args.eOvrDataType == GDT_Byte || args.eOvrDataType == GDT_UInt16 ||
    3766           0 :          args.eOvrDataType == GDT_UInt32))
    3767             :     {
    3768           8 :         int nBits = args.nOvrNBITS;
    3769           8 :         if (nBits == GDALGetDataTypeSize(args.eOvrDataType))
    3770           1 :             nBits = 0;
    3771           8 :         if (nBits > 0 && nBits < 32)
    3772           7 :             fMaxVal = static_cast<float>((1U << nBits) - 1);
    3773             :     }
    3774             : 
    3775        3696 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(
    3776             :         args.nDstXOff2 - args.nDstXOff, args.nDstYOff2 - args.nDstYOff,
    3777             :         GDALGetDataTypeSizeBytes(args.eOvrDataType));
    3778        3702 :     if (*ppDstBuffer == nullptr)
    3779             :     {
    3780           0 :         return CE_Failure;
    3781             :     }
    3782        3702 :     *peDstBufferDataType = args.eOvrDataType;
    3783             : 
    3784        3702 :     switch (args.eWrkDataType)
    3785             :     {
    3786        2977 :         case GDT_Byte:
    3787             :         {
    3788        2977 :             return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
    3789             :                 args, static_cast<const GByte *>(pChunk), *ppDstBuffer,
    3790             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3791        2977 :                 bKernelWithNegativeWeights, fMaxVal);
    3792             :         }
    3793             : 
    3794         395 :         case GDT_UInt16:
    3795             :         {
    3796         395 :             return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
    3797             :                 args, static_cast<const GUInt16 *>(pChunk), *ppDstBuffer,
    3798             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3799         396 :                 bKernelWithNegativeWeights, fMaxVal);
    3800             :         }
    3801             : 
    3802         301 :         case GDT_Float32:
    3803             :         {
    3804         301 :             return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
    3805             :                 args, static_cast<const float *>(pChunk), *ppDstBuffer,
    3806             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3807         301 :                 bKernelWithNegativeWeights, fMaxVal);
    3808             :         }
    3809             : 
    3810          29 :         case GDT_Float64:
    3811             :         {
    3812          29 :             return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
    3813             :                 args, static_cast<const double *>(pChunk), *ppDstBuffer,
    3814             :                 pfnFilterFunc, pfnFilterFunc4Values, nKernelRadius,
    3815          29 :                 bKernelWithNegativeWeights, fMaxVal);
    3816             :         }
    3817             : 
    3818           0 :         default:
    3819           0 :             break;
    3820             :     }
    3821             : 
    3822           0 :     CPLAssert(false);
    3823             :     return CE_Failure;
    3824             : }
    3825             : 
    3826             : /************************************************************************/
    3827             : /*                       GDALResampleChunkC32R()                        */
    3828             : /************************************************************************/
    3829             : 
    3830           2 : static CPLErr GDALResampleChunkC32R(const int nSrcWidth, const int nSrcHeight,
    3831             :                                     const float *pafChunk, const int nChunkYOff,
    3832             :                                     const int nChunkYSize, const int nDstYOff,
    3833             :                                     const int nDstYOff2, const int nOvrXSize,
    3834             :                                     const int nOvrYSize, void **ppDstBuffer,
    3835             :                                     GDALDataType *peDstBufferDataType,
    3836             :                                     const char *pszResampling)
    3837             : 
    3838             : {
    3839             :     enum Method
    3840             :     {
    3841             :         NEAR,
    3842             :         AVERAGE,
    3843             :         AVERAGE_MAGPHASE,
    3844             :         RMS,
    3845             :     };
    3846             : 
    3847           2 :     Method eMethod = NEAR;
    3848           2 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))
    3849             :     {
    3850           0 :         eMethod = NEAR;
    3851             :     }
    3852           2 :     else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
    3853             :     {
    3854           0 :         eMethod = AVERAGE_MAGPHASE;
    3855             :     }
    3856           2 :     else if (EQUAL(pszResampling, "RMS"))
    3857             :     {
    3858           2 :         eMethod = RMS;
    3859             :     }
    3860           0 :     else if (STARTS_WITH_CI(pszResampling, "AVER"))
    3861             :     {
    3862           0 :         eMethod = AVERAGE;
    3863             :     }
    3864             :     else
    3865             :     {
    3866           0 :         CPLError(
    3867             :             CE_Failure, CPLE_NotSupported,
    3868             :             "Resampling method %s is not supported for complex data types. "
    3869             :             "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
    3870             :             pszResampling);
    3871           0 :         return CE_Failure;
    3872             :     }
    3873             : 
    3874           2 :     const int nOXSize = nOvrXSize;
    3875           2 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
    3876             :                                        GDALGetDataTypeSizeBytes(GDT_CFloat32));
    3877           2 :     if (*ppDstBuffer == nullptr)
    3878             :     {
    3879           0 :         return CE_Failure;
    3880             :     }
    3881           2 :     float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
    3882           2 :     *peDstBufferDataType = GDT_CFloat32;
    3883             : 
    3884           2 :     const int nOYSize = nOvrYSize;
    3885           2 :     const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
    3886           2 :     const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
    3887             : 
    3888             :     /* ==================================================================== */
    3889             :     /*      Loop over destination scanlines.                                */
    3890             :     /* ==================================================================== */
    3891           8 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    3892             :     {
    3893           6 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    3894           6 :         if (nSrcYOff < nChunkYOff)
    3895           0 :             nSrcYOff = nChunkYOff;
    3896             : 
    3897           6 :         int nSrcYOff2 =
    3898           6 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
    3899           6 :         if (nSrcYOff2 == nSrcYOff)
    3900           0 :             nSrcYOff2++;
    3901             : 
    3902           6 :         if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
    3903             :         {
    3904           2 :             if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
    3905           0 :                 nSrcYOff = nSrcHeight - 1;
    3906           2 :             nSrcYOff2 = nSrcHeight;
    3907             :         }
    3908           6 :         if (nSrcYOff2 > nChunkYOff + nChunkYSize)
    3909           0 :             nSrcYOff2 = nChunkYOff + nChunkYSize;
    3910             : 
    3911           6 :         const float *const pafSrcScanline =
    3912           6 :             pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
    3913           6 :         float *const pafDstScanline =
    3914           6 :             pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
    3915             : 
    3916             :         /* --------------------------------------------------------------------
    3917             :          */
    3918             :         /*      Loop over destination pixels */
    3919             :         /* --------------------------------------------------------------------
    3920             :          */
    3921          18 :         for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
    3922             :         {
    3923          12 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    3924          12 :             int nSrcXOff2 =
    3925          12 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
    3926          12 :             if (nSrcXOff2 == nSrcXOff)
    3927           0 :                 nSrcXOff2++;
    3928          12 :             if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
    3929             :             {
    3930           6 :                 if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
    3931           0 :                     nSrcXOff = nSrcWidth - 1;
    3932           6 :                 nSrcXOff2 = nSrcWidth;
    3933             :             }
    3934             : 
    3935          12 :             if (eMethod == NEAR)
    3936             :             {
    3937           0 :                 pafDstScanline[iDstPixel * 2] = pafSrcScanline[nSrcXOff * 2];
    3938           0 :                 pafDstScanline[iDstPixel * 2 + 1] =
    3939           0 :                     pafSrcScanline[nSrcXOff * 2 + 1];
    3940             :             }
    3941          12 :             else if (eMethod == AVERAGE_MAGPHASE)
    3942             :             {
    3943           0 :                 double dfTotalR = 0.0;
    3944           0 :                 double dfTotalI = 0.0;
    3945           0 :                 double dfTotalM = 0.0;
    3946           0 :                 int nCount = 0;
    3947             : 
    3948           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    3949             :                 {
    3950           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    3951             :                     {
    3952           0 :                         const double dfR =
    3953           0 :                             pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
    3954           0 :                                                         iY - nSrcYOff) *
    3955           0 :                                                         nSrcWidth * 2];
    3956           0 :                         const double dfI =
    3957           0 :                             pafSrcScanline[iX * 2 +
    3958           0 :                                            static_cast<GPtrDiff_t>(iY -
    3959           0 :                                                                    nSrcYOff) *
    3960           0 :                                                nSrcWidth * 2 +
    3961           0 :                                            1];
    3962           0 :                         dfTotalR += dfR;
    3963           0 :                         dfTotalI += dfI;
    3964           0 :                         dfTotalM += std::hypot(dfR, dfI);
    3965           0 :                         ++nCount;
    3966             :                     }
    3967             :                 }
    3968             : 
    3969           0 :                 CPLAssert(nCount > 0);
    3970           0 :                 if (nCount == 0)
    3971             :                 {
    3972           0 :                     pafDstScanline[iDstPixel * 2] = 0.0;
    3973           0 :                     pafDstScanline[iDstPixel * 2 + 1] = 0.0;
    3974             :                 }
    3975             :                 else
    3976             :                 {
    3977           0 :                     pafDstScanline[iDstPixel * 2] =
    3978           0 :                         static_cast<float>(dfTotalR / nCount);
    3979           0 :                     pafDstScanline[iDstPixel * 2 + 1] =
    3980           0 :                         static_cast<float>(dfTotalI / nCount);
    3981             :                     const double dfM =
    3982           0 :                         std::hypot(pafDstScanline[iDstPixel * 2],
    3983           0 :                                    pafDstScanline[iDstPixel * 2 + 1]);
    3984           0 :                     const double dfDesiredM = dfTotalM / nCount;
    3985           0 :                     double dfRatio = 1.0;
    3986           0 :                     if (dfM != 0.0)
    3987           0 :                         dfRatio = dfDesiredM / dfM;
    3988             : 
    3989           0 :                     pafDstScanline[iDstPixel * 2] *=
    3990           0 :                         static_cast<float>(dfRatio);
    3991           0 :                     pafDstScanline[iDstPixel * 2 + 1] *=
    3992           0 :                         static_cast<float>(dfRatio);
    3993             :                 }
    3994             :             }
    3995          12 :             else if (eMethod == RMS)
    3996             :             {
    3997          12 :                 double dfTotalR = 0.0;
    3998          12 :                 double dfTotalI = 0.0;
    3999          12 :                 int nCount = 0;
    4000             : 
    4001          36 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4002             :                 {
    4003          72 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4004             :                     {
    4005          48 :                         const double dfR =
    4006          48 :                             pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
    4007          48 :                                                         iY - nSrcYOff) *
    4008          48 :                                                         nSrcWidth * 2];
    4009          48 :                         const double dfI =
    4010          48 :                             pafSrcScanline[iX * 2 +
    4011          48 :                                            static_cast<GPtrDiff_t>(iY -
    4012          48 :                                                                    nSrcYOff) *
    4013          48 :                                                nSrcWidth * 2 +
    4014          48 :                                            1];
    4015             : 
    4016          48 :                         dfTotalR += SQUARE(dfR);
    4017          48 :                         dfTotalI += SQUARE(dfI);
    4018             : 
    4019          48 :                         ++nCount;
    4020             :                     }
    4021             :                 }
    4022             : 
    4023          12 :                 CPLAssert(nCount > 0);
    4024          12 :                 if (nCount == 0)
    4025             :                 {
    4026           0 :                     pafDstScanline[iDstPixel * 2] = 0.0;
    4027           0 :                     pafDstScanline[iDstPixel * 2 + 1] = 0.0;
    4028             :                 }
    4029             :                 else
    4030             :                 {
    4031             :                     /* compute RMS */
    4032          12 :                     pafDstScanline[iDstPixel * 2] =
    4033          12 :                         static_cast<float>(sqrt(dfTotalR / nCount));
    4034          12 :                     pafDstScanline[iDstPixel * 2 + 1] =
    4035          12 :                         static_cast<float>(sqrt(dfTotalI / nCount));
    4036             :                 }
    4037             :             }
    4038           0 :             else if (eMethod == AVERAGE)
    4039             :             {
    4040           0 :                 double dfTotalR = 0.0;
    4041           0 :                 double dfTotalI = 0.0;
    4042           0 :                 int nCount = 0;
    4043             : 
    4044           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    4045             :                 {
    4046           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    4047             :                     {
    4048             :                         // TODO(schwehr): Maybe use std::complex?
    4049           0 :                         dfTotalR +=
    4050           0 :                             pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
    4051           0 :                                                         iY - nSrcYOff) *
    4052           0 :                                                         nSrcWidth * 2];
    4053           0 :                         dfTotalI += pafSrcScanline[iX * 2 +
    4054           0 :                                                    static_cast<GPtrDiff_t>(
    4055           0 :                                                        iY - nSrcYOff) *
    4056           0 :                                                        nSrcWidth * 2 +
    4057           0 :                                                    1];
    4058           0 :                         ++nCount;
    4059             :                     }
    4060             :                 }
    4061             : 
    4062           0 :                 CPLAssert(nCount > 0);
    4063           0 :                 if (nCount == 0)
    4064             :                 {
    4065           0 :                     pafDstScanline[iDstPixel * 2] = 0.0;
    4066           0 :                     pafDstScanline[iDstPixel * 2 + 1] = 0.0;
    4067             :                 }
    4068             :                 else
    4069             :                 {
    4070           0 :                     pafDstScanline[iDstPixel * 2] =
    4071           0 :                         static_cast<float>(dfTotalR / nCount);
    4072           0 :                     pafDstScanline[iDstPixel * 2 + 1] =
    4073           0 :                         static_cast<float>(dfTotalI / nCount);
    4074             :                 }
    4075             :             }
    4076             :         }
    4077             :     }
    4078             : 
    4079           2 :     return CE_None;
    4080             : }
    4081             : 
    4082             : /************************************************************************/
    4083             : /*                  GDALRegenerateCascadingOverviews()                  */
    4084             : /*                                                                      */
    4085             : /*      Generate a list of overviews in order from largest to           */
    4086             : /*      smallest, computing each from the next larger.                  */
    4087             : /************************************************************************/
    4088             : /* Regenerate several overviews of one band in cascade.
                        *
                        * papoOvrBands is first reordered IN PLACE by decreasing pixel count
                        * (XSize * YSize), so the caller's array is modified. Each overview is
                        * then produced by its own GDALRegenerateOverviewsEx() call on a single
                        * destination band: the first one reads from poSrcBand, every following
                        * one reads from the overview generated just before it.
                        *
                        * poSrcBand      full resolution source band.
                        * nOverviews     number of entries in papoOvrBands.
                        * papoOvrBands   overview bands to fill (reordered by this function).
                        * pszResampling  resampling method name; a name starting with
                        *                "AVERAGE_BIT2G" is only used for the first level,
                        *                the following levels use "AVERAGE".
                        * pfnProgress, pProgressData
                        *                progress callback and its data; each level reports
                        *                through a scaled progress sized by its share of the
                        *                total pixel count.
                        * papszOptions   forwarded unchanged to GDALRegenerateOverviewsEx().
                        *
                        * Returns CE_None, or the error of the first level that fails (the
                        * remaining levels are then not generated).
                        */
    4089          42 : static CPLErr GDALRegenerateCascadingOverviews(
    4090             :     GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
    4091             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    4092             :     void *pProgressData, CSLConstList papszOptions)
    4093             : 
    4094             : {
    4095             :     /* -------------------------------------------------------------------- */
    4096             :     /*      First, we must put the overviews in order from largest to       */
    4097             :     /*      smallest (bubble sort on the XSize * YSize product).            */
    4098             :     /* -------------------------------------------------------------------- */
    4099         120 :     for (int i = 0; i < nOverviews - 1; ++i)
    4100             :     {
    4101         270 :         for (int j = 0; j < nOverviews - i - 1; ++j)
    4102             :         {
    4103         192 :             if (papoOvrBands[j]->GetXSize() *  // product is evaluated in float
    4104         192 :                     static_cast<float>(papoOvrBands[j]->GetYSize()) <
    4105         192 :                 papoOvrBands[j + 1]->GetXSize() *
    4106         192 :                     static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
    4107             :             {
    4108           0 :                 GDALRasterBand *poTempBand = papoOvrBands[j];  // swap neighbours j and j + 1
    4109           0 :                 papoOvrBands[j] = papoOvrBands[j + 1];
    4110           0 :                 papoOvrBands[j + 1] = poTempBand;
    4111             :             }
    4112             :         }
    4113             :     }
    4114             : 
    4115             :     /* -------------------------------------------------------------------- */
    4116             :     /*      Count total pixels so we can prepare appropriate scaled         */
    4117             :     /*      progress functions.                                             */
    4118             :     /* -------------------------------------------------------------------- */
    4119          42 :     double dfTotalPixels = 0.0;
    4120             : 
    4121         162 :     for (int i = 0; i < nOverviews; ++i)
    4122             :     {
    4123         120 :         dfTotalPixels += papoOvrBands[i]->GetXSize() *
    4124         120 :                          static_cast<double>(papoOvrBands[i]->GetYSize());
    4125             :     }
    4126             : 
    4127             :     /* -------------------------------------------------------------------- */
    4128             :     /*      Generate all the bands.                                         */
    4129             :     /* -------------------------------------------------------------------- */
    4130          42 :     double dfPixelsProcessed = 0.0;  // pixels of the levels already generated
    4131             : 
    4132         162 :     for (int i = 0; i < nOverviews; ++i)
    4133             :     {
    4134         120 :         GDALRasterBand *poBaseBand = poSrcBand;  // first level reads the source band
    4135         120 :         if (i != 0)
    4136          78 :             poBaseBand = papoOvrBands[i - 1];  // cascade: read the previous, larger overview
    4137             : 
    4138         120 :         double dfPixels = papoOvrBands[i]->GetXSize() *
    4139         120 :                           static_cast<double>(papoOvrBands[i]->GetYSize());
    4140             : 
    4141         240 :         void *pScaledProgressData = GDALCreateScaledProgress(
    4142             :             dfPixelsProcessed / dfTotalPixels,  // fraction done before this level
    4143         120 :             (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,  // fraction done after it
    4144             :             pProgressData);
    4145             : 
    4146         240 :         const CPLErr eErr = GDALRegenerateOverviewsEx(
    4147             :             poBaseBand, 1,
    4148         120 :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,  // i-th entry as a 1-element list
    4149             :             pszResampling, GDALScaledProgress, pScaledProgressData,
    4150             :             papszOptions);
    4151         120 :         GDALDestroyScaledProgress(pScaledProgressData);  // released before the error check
    4152             : 
    4153         120 :         if (eErr != CE_None)
    4154           0 :             return eErr;  // stop at the first failing level
    4155             : 
    4156         120 :         dfPixelsProcessed += dfPixels;
    4157             : 
    4158             :         // Only do the bit2grayscale promotion on the base band: the levels
                       // generated after this one are resampled with plain "AVERAGE".
    4159         120 :         if (STARTS_WITH_CI(pszResampling,
    4160             :                            "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
    4161           8 :             pszResampling = "AVERAGE";
    4162             :     }
    4163             : 
    4164          42 :     return CE_None;
    4165             : }
    4166             : 
    4167             : /************************************************************************/
    4168             : /*                    GDALGetResampleFunction()                         */
    4169             : /************************************************************************/
    4170             : /* Look up the chunk resampling routine for a resampling method name.
                        *
                        * pszResampling  method name. "NEAR" and "AVER" are matched as prefixes
                        *                with STARTS_WITH_CI(); "RMS", "GAUSS", "MODE", "CUBIC",
                        *                "CUBICSPLINE", "LANCZOS" and "BILINEAR" are matched
                        *                with EQUAL().
                        * pnRadius       optional output (may be NULL). It is set to 0 first,
                        *                then overwritten with 1 for GAUSS, or with the value of
                        *                GWKGetFilterRadius() for the four convolution methods.
                        *
                        * Returns the resampling function, or nullptr after emitting a
                        * CE_Failure error when the name is not recognised.
                        */
    4171        3884 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
    4172             :                                              int *pnRadius)
    4173             : {
    4174        3884 :     if (pnRadius)
    4175        3885 :         *pnRadius = 0;  // default for the methods that do not set a radius below
    4176        3884 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))
    4177         444 :         return GDALResampleChunk_Near;
    4178        3440 :     else if (STARTS_WITH_CI(pszResampling, "AVER") ||  // any AVER... name and RMS share one routine
    4179        2915 :              EQUAL(pszResampling, "RMS"))
    4180         552 :         return GDALResampleChunk_AverageOrRMS;
    4181        2888 :     else if (EQUAL(pszResampling, "GAUSS"))
    4182             :     {
    4183          26 :         if (pnRadius)
    4184          26 :             *pnRadius = 1;
    4185          26 :         return GDALResampleChunk_Gauss;
    4186             :     }
    4187        2862 :     else if (EQUAL(pszResampling, "MODE"))
    4188          96 :         return GDALResampleChunk_Mode;
    4189        2766 :     else if (EQUAL(pszResampling, "CUBIC"))  // CUBIC, CUBICSPLINE, LANCZOS, BILINEAR: same routine, different radius
    4190             :     {
    4191         396 :         if (pnRadius)
    4192         396 :             *pnRadius = GWKGetFilterRadius(GRA_Cubic);
    4193         389 :         return GDALResampleChunk_Convolution;
    4194             :     }
    4195        2370 :     else if (EQUAL(pszResampling, "CUBICSPLINE"))
    4196             :     {
    4197           3 :         if (pnRadius)
    4198           3 :             *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
    4199           3 :         return GDALResampleChunk_Convolution;
    4200             :     }
    4201        2367 :     else if (EQUAL(pszResampling, "LANCZOS"))
    4202             :     {
    4203           8 :         if (pnRadius)
    4204           8 :             *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
    4205           8 :         return GDALResampleChunk_Convolution;
    4206             :     }
    4207        2359 :     else if (EQUAL(pszResampling, "BILINEAR"))
    4208             :     {
    4209        2367 :         if (pnRadius)
    4210        2367 :             *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
    4211        2367 :         return GDALResampleChunk_Convolution;
    4212             :     }
    4213             :     else
    4214             :     {
    4215           0 :         CPLError(  // unknown method: report it and let the caller handle the null function
    4216             :             CE_Failure, CPLE_AppDefined,
    4217             :             "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
    4218             :             pszResampling);
    4219           0 :         return nullptr;
    4220             :     }
    4221             : }
    4222             : 
    4223             : /************************************************************************/
    4224             : /*                      GDALGetOvrWorkDataType()                        */
    4225             : /************************************************************************/
    4226             : /* Select the working data type for resampling a band of type
                        * eSrcDataType with the method named pszResampling.
                        *
                        * Decision order:
                        *  - name starting with "NEAR", or "MODE": eSrcDataType is returned.
                        *  - GDT_Byte source with an "AVER..." name, RMS, CUBIC, CUBICSPLINE,
                        *    LANCZOS or BILINEAR: GDT_Byte.
                        *  - GDT_UInt16 source with the same methods: GDT_UInt16.
                        *  - GAUSS: GDT_Float64.
                        *  - anything else: GDT_Float32 when the source is Byte, Int8, UInt16,
                        *    Int16 or Float32, GDT_Float64 otherwise.
                        */
    4227        3774 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
    4228             :                                     GDALDataType eSrcDataType)
    4229             : {
    4230        3774 :     if (STARTS_WITH_CI(pszResampling, "NEAR") || EQUAL(pszResampling, "MODE"))
    4231             :     {
    4232         533 :         return eSrcDataType;  // these methods work directly in the source type
    4233             :     }
    4234        3241 :     else if (eSrcDataType == GDT_Byte &&
    4235        2931 :              (STARTS_WITH_CI(pszResampling, "AVER") ||
    4236        2469 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4237        2257 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4238        2254 :               EQUAL(pszResampling, "LANCZOS") ||
    4239        2249 :               EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))  // "MODE" cannot match here: it already returned above
    4240             :     {
    4241        2928 :         return GDT_Byte;
    4242             :     }
    4243         313 :     else if (eSrcDataType == GDT_UInt16 &&
    4244         118 :              (STARTS_WITH_CI(pszResampling, "AVER") ||
    4245         107 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4246           3 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4247           3 :               EQUAL(pszResampling, "LANCZOS") ||
    4248           2 :               EQUAL(pszResampling, "BILINEAR") || EQUAL(pszResampling, "MODE")))  // "MODE" cannot match here either
    4249             :     {
    4250         110 :         return GDT_UInt16;
    4251             :     }
    4252         203 :     else if (EQUAL(pszResampling, "GAUSS"))
    4253          20 :         return GDT_Float64;  // GAUSS is absent from the lists above, so Byte and UInt16 sources land here too
    4254             : 
    4255         183 :     if (eSrcDataType == GDT_Byte || eSrcDataType == GDT_Int8 ||  // every combination not handled above
    4256         184 :         eSrcDataType == GDT_UInt16 || eSrcDataType == GDT_Int16 ||
    4257             :         eSrcDataType == GDT_Float32)
    4258             :     {
    4259         147 :         return GDT_Float32;
    4260             :     }
    4261          36 :     return GDT_Float64;
    4262             : }
    4263             : 
    4264             : namespace
    4265             : {
    4266             : // Structure to hold a pointer to free with CPLFree(): the pointer given to
                       // the constructor is passed to CPLFree() when the holder is destroyed.
    4267             : struct PointerHolder
    4268             : {
    4269             :     void *ptr = nullptr;  // pointer handed to CPLFree() by the destructor
    4270             : 
    4271       34757 :     explicit PointerHolder(void *ptrIn) : ptr(ptrIn)  // stores ptrIn, no allocation here
    4272             :     {
    4273       34757 :     }
    4274             : 
    4275       34759 :     ~PointerHolder()
    4276       34759 :     {
    4277       34759 :         CPLFree(ptr);
    4278       34759 :     }
    4279             : 
    4280             :     PointerHolder(const PointerHolder &) = delete;  // not copyable, so a copy can never free ptr a second time
    4281             :     PointerHolder &operator=(const PointerHolder &) = delete;
    4282             : };
    4283             : }  // namespace
    4284             : 
    4285             : /************************************************************************/
    4286             : /*                      GDALRegenerateOverviews()                       */
    4287             : /************************************************************************/
    4288             : 
    4289             : /**
    4290             :  * \brief Generate downsampled overviews.
    4291             :  *
    4292             :  * This function will generate one or more overview images from a base image
    4293             :  * using the requested downsampling algorithm.  Its primary use is for
    4294             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4295             :  * used to generate downsampled images in one file from another outside the
    4296             :  * overview architecture.
    4297             :  *
                        * This is a thin wrapper: it calls GDALRegenerateOverviewsEx() with the
                        * same arguments and a NULL option list.
                        *
    4298             :  * The output bands need to exist in advance.
    4299             :  *
    4300             :  * The full set of resampling algorithms is documented in
    4301             :  * GDALDataset::BuildOverviews().
    4302             :  *
    4303             :  * This function will properly honour NODATA_VALUES tuples (special dataset
    4304             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    4305             :  * considered as the nodata value and not each value of the triplet
    4306             :  * independently per band.
    4307             :  *
    4308             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4309             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    4310             :  * overview computation.
    4311             :  *
    4312             :  * @param hSrcBand the source (base level) band.
    4313             :  * @param nOverviewCount the number of downsampled bands being generated.
    4314             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4315             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4316             :  * @param pfnProgress progress report function.
    4317             :  * @param pProgressData progress function callback data.
    4318             :  * @return CE_None on success or CE_Failure on failure.
    4319             :  */
    4320         252 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
    4321             :                                GDALRasterBandH *pahOvrBands,
    4322             :                                const char *pszResampling,
    4323             :                                GDALProgressFunc pfnProgress,
    4324             :                                void *pProgressData)
    4325             : 
    4326             : {
    4327         252 :     return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
    4328             :                                      pszResampling, pfnProgress, pProgressData,
    4329         252 :                                      nullptr);  // papszOptions: none
    4330             : }
    4331             : 
    4332             : /************************************************************************/
    4333             : /*                     GDALRegenerateOverviewsEx()                      */
    4334             : /************************************************************************/
    4335             : 
    4336             : /**
    4337             :  * \brief Generate downsampled overviews.
    4338             :  *
    4339             :  * This function will generate one or more overview images from a base image
    4340             :  * using the requested downsampling algorithm.  Its primary use is for
    4341             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4342             :  * used to generate downsampled images in one file from another outside the
    4343             :  * overview architecture.
    4344             :  *
    4345             :  * The output bands need to exist in advance.
    4346             :  *
    4347             :  * The full set of resampling algorithms is documented in
    4348             :  * GDALDataset::BuildOverviews().
    4349             :  *
    4350             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    4351             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    4352             :  * considered as the nodata value and not each value of the triplet
    4353             :  * independently per band.
    4354             :  *
    4355             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4356             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    4357             :  * overview computation.
    4358             :  *
    4359             :  * @param hSrcBand the source (base level) band.
    4360             :  * @param nOverviewCount the number of downsampled bands being generated.
    4361             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4362             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4363             :  * @param pfnProgress progress report function.
    4364             :  * @param pProgressData progress function callback data.
    4365             :  * @param papszOptions NULL terminated list of options as key=value pairs, or
    4366             :  * NULL
    4367             :  * @return CE_None on success or CE_Failure on failure.
    4368             :  * @since GDAL 3.6
    4369             :  */
    4370         815 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
    4371             :                                  GDALRasterBandH *pahOvrBands,
    4372             :                                  const char *pszResampling,
    4373             :                                  GDALProgressFunc pfnProgress,
    4374             :                                  void *pProgressData, CSLConstList papszOptions)
    4375             : 
    4376             : {
    4377         815 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    4378         815 :     GDALRasterBand **papoOvrBands =
    4379             :         reinterpret_cast<GDALRasterBand **>(pahOvrBands);
    4380             : 
    4381         815 :     if (pfnProgress == nullptr)
    4382         252 :         pfnProgress = GDALDummyProgress;
    4383             : 
    4384         815 :     if (EQUAL(pszResampling, "NONE"))
    4385          61 :         return CE_None;
    4386             : 
    4387         754 :     int nKernelRadius = 0;
    4388             :     GDALResampleFunction pfnResampleFn =
    4389         754 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    4390             : 
    4391         754 :     if (pfnResampleFn == nullptr)
    4392           0 :         return CE_Failure;
    4393             : 
    4394             :     /* -------------------------------------------------------------------- */
    4395             :     /*      Check color tables...                                           */
    4396             :     /* -------------------------------------------------------------------- */
    4397         754 :     GDALColorTable *poColorTable = nullptr;
    4398             : 
    4399         391 :     if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
    4400        1582 :          EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
    4401         448 :         poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    4402             :     {
    4403           9 :         poColorTable = poSrcBand->GetColorTable();
    4404           9 :         if (poColorTable != nullptr)
    4405             :         {
    4406           9 :             if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
    4407             :             {
    4408           0 :                 CPLError(CE_Warning, CPLE_AppDefined,
    4409             :                          "Computing overviews on palette index raster bands "
    4410             :                          "with a palette whose color interpretation is not RGB "
    4411             :                          "will probably lead to unexpected results.");
    4412           0 :                 poColorTable = nullptr;
    4413             :             }
    4414           9 :             else if (poColorTable->IsIdentity())
    4415             :             {
    4416           0 :                 poColorTable = nullptr;
    4417             :             }
    4418             :         }
    4419             :         else
    4420             :         {
    4421           0 :             CPLError(CE_Warning, CPLE_AppDefined,
    4422             :                      "Computing overviews on palette index raster bands "
    4423             :                      "without a palette will probably lead to unexpected "
    4424             :                      "results.");
    4425             :         }
    4426             :     }
    4427             :     // Not ready yet
    4428        2181 :     else if ((EQUAL(pszResampling, "CUBIC") ||
    4429         691 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4430         691 :               EQUAL(pszResampling, "LANCZOS") ||
    4431        1493 :               EQUAL(pszResampling, "BILINEAR")) &&
    4432          57 :              poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    4433             :     {
    4434           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    4435             :                  "Computing %s overviews on palette index raster bands "
    4436             :                  "will probably lead to unexpected results.",
    4437             :                  pszResampling);
    4438             :     }
    4439             : 
    4440             :     // If we have a nodata mask and we are doing something more complicated
    4441             :     // than nearest neighbour, we have to fetch the nodata mask.
    4442             : 
    4443         754 :     GDALRasterBand *poMaskBand = nullptr;
    4444         754 :     bool bUseNoDataMask = false;
    4445         754 :     bool bCanUseCascaded = true;
    4446             : 
    4447         754 :     if (!STARTS_WITH_CI(pszResampling, "NEAR"))
    4448             :     {
    4449             :         // Special case if we are an alpha/mask band. We want it to be
    4450             :         // considered as the mask band to avoid alpha=0 to be taken into account
    4451             :         // in average computation.
    4452         505 :         if (poSrcBand->IsMaskBand())
    4453             :         {
    4454          90 :             poMaskBand = poSrcBand;
    4455          90 :             bUseNoDataMask = true;
    4456             :         }
    4457             :         else
    4458             :         {
    4459         415 :             poMaskBand = poSrcBand->GetMaskBand();
    4460         415 :             const int nMaskFlags = poSrcBand->GetMaskFlags();
    4461         415 :             bCanUseCascaded =
    4462         415 :                 (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
    4463         415 :             bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
    4464             :         }
    4465             :     }
    4466             : 
    4467             :     /* -------------------------------------------------------------------- */
    4468             :     /*      If we are operating on multiple overviews, and using            */
    4469             :     /*      averaging, lets do them in cascading order to reduce the        */
    4470             :     /*      amount of computation.                                          */
    4471             :     /* -------------------------------------------------------------------- */
    4472             : 
    4473             :     // In case the mask may be computed from another band of the dataset,
    4474             :     // we can't use cascaded generation, as the computation of the overviews
    4475             :     // of the band used for the mask band may not have yet occurred (#3033).
    4476         754 :     if ((STARTS_WITH_CI(pszResampling, "AVER") ||
    4477         391 :          EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
    4478         360 :          EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
    4479         306 :          EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
    4480         754 :          EQUAL(pszResampling, "MODE")) &&
    4481          42 :         nOverviewCount > 1 && bCanUseCascaded)
    4482          42 :         return GDALRegenerateCascadingOverviews(
    4483             :             poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
    4484          42 :             pProgressData, papszOptions);
    4485             : 
    4486             :     /* -------------------------------------------------------------------- */
    4487             :     /*      Setup one horizontal swath to read from the raw buffer.         */
    4488             :     /* -------------------------------------------------------------------- */
    4489         712 :     int nFRXBlockSize = 0;
    4490         712 :     int nFRYBlockSize = 0;
    4491         712 :     poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
    4492             : 
    4493         712 :     const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
    4494        1175 :     const bool bUseGenericResampleFn = STARTS_WITH_CI(pszResampling, "NEAR") ||
    4495        1129 :                                        EQUAL(pszResampling, "MODE") ||
    4496         417 :                                        !GDALDataTypeIsComplex(eSrcDataType);
    4497             :     const GDALDataType eWrkDataType =
    4498             :         bUseGenericResampleFn
    4499         712 :             ? GDALGetOvrWorkDataType(pszResampling, eSrcDataType)
    4500         712 :             : GDT_CFloat32;
    4501             : 
    4502         712 :     const int nWidth = poSrcBand->GetXSize();
    4503         712 :     const int nHeight = poSrcBand->GetYSize();
    4504             : 
    4505         712 :     int nMaxOvrFactor = 1;
    4506        1505 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    4507             :     {
    4508         793 :         const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
    4509         793 :         const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
    4510         793 :         nMaxOvrFactor = std::max(
    4511             :             nMaxOvrFactor,
    4512         793 :             static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
    4513         793 :         nMaxOvrFactor = std::max(
    4514             :             nMaxOvrFactor,
    4515         793 :             static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
    4516             :     }
    4517             : 
    4518         712 :     int nFullResYChunk = nFRYBlockSize;
    4519         712 :     int nMaxChunkYSizeQueried = 0;
    4520             : 
    4521             :     const auto UpdateChunkHeightAndGetChunkSize =
    4522        9267 :         [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
    4523       27801 :          eWrkDataType, nWidth]()
    4524             :     {
    4525             :         // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
    4526             :         // + nFullResYChunk) / nMaxOvrFactor)
    4527        9267 :         nFullResYChunk = std::max(nFullResYChunk, 2 * nMaxOvrFactor);
    4528        9267 :         nMaxChunkYSizeQueried =
    4529        9267 :             nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
    4530        9267 :         return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
    4531        9267 :                nMaxChunkYSizeQueried * nWidth;
    4532         712 :     };
    4533             : 
    4534             :     // Only configurable for debug / testing
    4535             :     const char *pszChunkYSize =
    4536         712 :         CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
    4537         712 :     if (pszChunkYSize)
    4538             :     {
    4539             :         // coverity[tainted_data]
    4540           0 :         nFullResYChunk = atoi(pszChunkYSize);
    4541             :     }
    4542             : 
    4543             :     // Only configurable for debug / testing
    4544             :     const int nChunkMaxSize =
    4545         712 :         atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
    4546             : 
    4547         712 :     auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
    4548         712 :     if (nChunkSize > nChunkMaxSize)
    4549             :     {
    4550           3 :         if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
    4551           9 :             !GDALDataTypeIsComplex(eSrcDataType) &&
    4552           3 :             (!STARTS_WITH_CI(pszResampling, "AVER") ||
    4553           0 :              EQUAL(pszResampling, "AVERAGE")))
    4554             :         {
    4555             :             // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
    4556             :             // which use a block based strategy, which is much less memory
    4557             :             // hungry.
    4558           3 :             return GDALRegenerateOverviewsMultiBand(
    4559             :                 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
    4560           3 :                 pfnProgress, pProgressData, papszOptions);
    4561             :         }
    4562           0 :         else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
    4563             :         {
    4564           0 :             return GDALRegenerateCascadingOverviews(
    4565             :                 poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
    4566           0 :                 pfnProgress, pProgressData, papszOptions);
    4567             :         }
    4568             :     }
    4569         709 :     else if (pszChunkYSize == nullptr)
    4570             :     {
    4571             :         // Try to get as close as possible to nChunkMaxSize
    4572        9264 :         while (nChunkSize * 2 < nChunkMaxSize)
    4573             :         {
    4574        8555 :             nFullResYChunk *= 2;
    4575        8555 :             nChunkSize = UpdateChunkHeightAndGetChunkSize();
    4576             :         }
    4577             :     }
    4578             : 
    4579         709 :     int nHasNoData = 0;
    4580         709 :     const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
    4581         709 :     const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    4582             :     const bool bPropagateNoData =
    4583         709 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    4584             : 
    4585             :     // Structure describing a resampling job
    4586             :     struct OvrJob
    4587             :     {
    4588             :         // Buffers to free when job is finished
    4589             :         std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
    4590             :         std::shared_ptr<PointerHolder> oSrcBufferHolder{};
    4591             :         std::unique_ptr<PointerHolder> oDstBufferHolder{};
    4592             : 
    4593             :         GDALRasterBand *poDstBand = nullptr;
    4594             : 
    4595             :         // Input parameters of pfnResampleFn
    4596             :         GDALResampleFunction pfnResampleFn = nullptr;
    4597             :         int nSrcWidth = 0;
    4598             :         int nSrcHeight = 0;
    4599             :         int nDstWidth = 0;
    4600             :         GDALOverviewResampleArgs args{};
    4601             :         const void *pChunk = nullptr;
    4602             :         bool bUseGenericResampleFn = false;
    4603             : 
    4604             :         // Output values of resampling function
    4605             :         CPLErr eErr = CE_Failure;
    4606             :         void *pDstBuffer = nullptr;
    4607             :         GDALDataType eDstBufferDataType = GDT_Unknown;
    4608             : 
    4609             :         // Synchronization
    4610             :         bool bFinished = false;
    4611             :         std::mutex mutex{};
    4612             :         std::condition_variable cv{};
    4613             : 
    4614           0 :         void SetSrcMaskBufferHolder(
    4615             :             const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
    4616             :         {
    4617           0 :             oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
    4618           0 :         }
    4619             : 
    4620           0 :         void SetSrcBufferHolder(
    4621             :             const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
    4622             :         {
    4623           0 :             oSrcBufferHolder = oSrcBufferHolderIn;
    4624           0 :         }
    4625             :     };
    4626             : 
    4627             :     // Thread function to resample
    4628         791 :     const auto JobResampleFunc = [](void *pData)
    4629             :     {
    4630         791 :         OvrJob *poJob = static_cast<OvrJob *>(pData);
    4631             : 
    4632         791 :         if (poJob->bUseGenericResampleFn)
    4633             :         {
    4634         789 :             poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
    4635             :                                                &(poJob->pDstBuffer),
    4636             :                                                &(poJob->eDstBufferDataType));
    4637             :         }
    4638             :         else
    4639             :         {
    4640           2 :             poJob->eErr = GDALResampleChunkC32R(
    4641             :                 poJob->nSrcWidth, poJob->nSrcHeight,
    4642           2 :                 static_cast<const float *>(poJob->pChunk),
    4643             :                 poJob->args.nChunkYOff, poJob->args.nChunkYSize,
    4644             :                 poJob->args.nDstYOff, poJob->args.nDstYOff2,
    4645             :                 poJob->args.nOvrXSize, poJob->args.nOvrYSize,
    4646             :                 &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
    4647             :                 poJob->args.pszResampling);
    4648             :         }
    4649             : 
    4650             :         poJob->oDstBufferHolder =
    4651         791 :             std::make_unique<PointerHolder>(poJob->pDstBuffer);
    4652             : 
    4653             :         {
    4654        1582 :             std::lock_guard<std::mutex> guard(poJob->mutex);
    4655         791 :             poJob->bFinished = true;
    4656         791 :             poJob->cv.notify_one();
    4657             :         }
    4658         791 :     };
    4659             : 
    4660             :     // Function to write resample data to target band
    4661         791 :     const auto WriteJobData = [](const OvrJob *poJob)
    4662             :     {
    4663        1582 :         return poJob->poDstBand->RasterIO(
    4664         791 :             GF_Write, 0, poJob->args.nDstYOff, poJob->nDstWidth,
    4665         791 :             poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,
    4666         791 :             poJob->nDstWidth, poJob->args.nDstYOff2 - poJob->args.nDstYOff,
    4667         791 :             poJob->eDstBufferDataType, 0, 0, nullptr);
    4668             :     };
    4669             : 
    4670             :     // Wait for completion of oldest job and serialize it
    4671             :     const auto WaitAndFinalizeOldestJob =
    4672           0 :         [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    4673             :     {
    4674           0 :         auto poOldestJob = jobList.front().get();
    4675             :         {
    4676           0 :             std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
    4677             :             // coverity[missing_lock:FALSE]
    4678           0 :             while (!poOldestJob->bFinished)
    4679             :             {
    4680           0 :                 poOldestJob->cv.wait(oGuard);
    4681             :             }
    4682             :         }
    4683           0 :         CPLErr l_eErr = poOldestJob->eErr;
    4684           0 :         if (l_eErr == CE_None)
    4685             :         {
    4686           0 :             l_eErr = WriteJobData(poOldestJob);
    4687             :         }
    4688             : 
    4689           0 :         jobList.pop_front();
    4690           0 :         return l_eErr;
    4691             :     };
    4692             : 
    4693             :     // Queue of jobs
    4694        1418 :     std::list<std::unique_ptr<OvrJob>> jobList;
    4695             : 
    4696         709 :     GByte *pabyChunkNodataMask = nullptr;
    4697         709 :     void *pChunk = nullptr;
    4698             : 
    4699         709 :     const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
    4700        2836 :     const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
    4701         709 :                                                        ? CPLGetNumCPUs()
    4702         709 :                                                        : atoi(pszThreads)));
    4703             :     auto poThreadPool =
    4704         709 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    4705             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    4706        1418 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    4707             : 
    4708             :     /* -------------------------------------------------------------------- */
    4709             :     /*      Loop over image operating on chunks.                            */
    4710             :     /* -------------------------------------------------------------------- */
    4711         709 :     int nChunkYOff = 0;
    4712         709 :     CPLErr eErr = CE_None;
    4713             : 
    4714        1423 :     for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
    4715         714 :          nChunkYOff += nFullResYChunk)
    4716             :     {
    4717         714 :         if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
    4718             :                          pProgressData))
    4719             :         {
    4720           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    4721           0 :             eErr = CE_Failure;
    4722             :         }
    4723             : 
    4724         714 :         if (nFullResYChunk + nChunkYOff > nHeight)
    4725         707 :             nFullResYChunk = nHeight - nChunkYOff;
    4726             : 
    4727         714 :         int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
    4728         714 :         int nChunkYSizeQueried =
    4729         714 :             nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
    4730         714 :         if (nChunkYOffQueried < 0)
    4731             :         {
    4732          62 :             nChunkYSizeQueried += nChunkYOffQueried;
    4733          62 :             nChunkYOffQueried = 0;
    4734             :         }
    4735         714 :         if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
    4736          62 :             nChunkYSizeQueried = nHeight - nChunkYOffQueried;
    4737             : 
    4738             :         // Avoid accumulating too many tasks and exhausting RAM
    4739             :         // Try to complete already finished jobs
    4740         714 :         while (eErr == CE_None && !jobList.empty())
    4741             :         {
    4742           0 :             auto poOldestJob = jobList.front().get();
    4743             :             {
    4744           0 :                 std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
    4745           0 :                 if (!poOldestJob->bFinished)
    4746             :                 {
    4747           0 :                     break;
    4748             :                 }
    4749             :             }
    4750           0 :             eErr = poOldestJob->eErr;
    4751           0 :             if (eErr == CE_None)
    4752             :             {
    4753           0 :                 eErr = WriteJobData(poOldestJob);
    4754             :             }
    4755             : 
    4756           0 :             jobList.pop_front();
    4757             :         }
    4758             : 
    4759             :         // And in case we have saturated the number of threads,
    4760             :         // wait for completion of tasks to go below the threshold.
    4761        1428 :         while (eErr == CE_None &&
    4762         714 :                jobList.size() >= static_cast<size_t>(nThreads))
    4763             :         {
    4764           0 :             eErr = WaitAndFinalizeOldestJob(jobList);
    4765             :         }
    4766             : 
    4767             :         // (Re)allocate buffers if needed
    4768         714 :         if (pChunk == nullptr)
    4769             :         {
    4770         709 :             pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    4771             :                                          nMaxChunkYSizeQueried, nWidth);
    4772             :         }
    4773         714 :         if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
    4774             :         {
    4775             :             pabyChunkNodataMask = static_cast<GByte *>(
    4776         274 :                 VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
    4777             :         }
    4778             : 
    4779         714 :         if (pChunk == nullptr ||
    4780         274 :             (bUseNoDataMask && pabyChunkNodataMask == nullptr))
    4781             :         {
    4782           0 :             CPLFree(pChunk);
    4783           0 :             CPLFree(pabyChunkNodataMask);
    4784           0 :             return CE_Failure;
    4785             :         }
    4786             : 
    4787             :         // Read chunk.
    4788         714 :         if (eErr == CE_None)
    4789         714 :             eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    4790             :                                        nChunkYSizeQueried, pChunk, nWidth,
    4791             :                                        nChunkYSizeQueried, eWrkDataType, 0, 0,
    4792             :                                        nullptr);
    4793         714 :         if (eErr == CE_None && bUseNoDataMask)
    4794         274 :             eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    4795             :                                         nChunkYSizeQueried, pabyChunkNodataMask,
    4796             :                                         nWidth, nChunkYSizeQueried, GDT_Byte, 0,
    4797             :                                         0, nullptr);
    4798             : 
    4799             :         // Special case to promote 1bit data to 8bit 0/255 values.
    4800         714 :         if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
    4801             :         {
    4802           9 :             if (eWrkDataType == GDT_Float32)
    4803             :             {
    4804           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    4805           0 :                 for (GPtrDiff_t i = 0;
    4806           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4807             :                      i++)
    4808             :                 {
    4809           0 :                     if (pafChunk[i] == 1.0)
    4810           0 :                         pafChunk[i] = 255.0;
    4811             :                 }
    4812             :             }
    4813           9 :             else if (eWrkDataType == GDT_Byte)
    4814             :             {
    4815           9 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    4816      168417 :                 for (GPtrDiff_t i = 0;
    4817      168417 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4818             :                      i++)
    4819             :                 {
    4820      168408 :                     if (pabyChunk[i] == 1)
    4821      127437 :                         pabyChunk[i] = 255;
    4822             :                 }
    4823             :             }
    4824           0 :             else if (eWrkDataType == GDT_UInt16)
    4825             :             {
    4826           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    4827           0 :                 for (GPtrDiff_t i = 0;
    4828           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4829             :                      i++)
    4830             :                 {
    4831           0 :                     if (pasChunk[i] == 1)
    4832           0 :                         pasChunk[i] = 255;
    4833             :                 }
    4834             :             }
    4835           0 :             else if (eWrkDataType == GDT_Float64)
    4836             :             {
    4837           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    4838           0 :                 for (GPtrDiff_t i = 0;
    4839           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4840             :                      i++)
    4841             :                 {
    4842           0 :                     if (padfChunk[i] == 1.0)
    4843           0 :                         padfChunk[i] = 255.0;
    4844             :                 }
    4845             :             }
    4846             :             else
    4847             :             {
    4848           0 :                 CPLAssert(false);
    4849             :             }
    4850             :         }
    4851         705 :         else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
    4852             :         {
    4853           0 :             if (eWrkDataType == GDT_Float32)
    4854             :             {
    4855           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    4856           0 :                 for (GPtrDiff_t i = 0;
    4857           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4858             :                      i++)
    4859             :                 {
    4860           0 :                     if (pafChunk[i] == 1.0)
    4861           0 :                         pafChunk[i] = 0.0;
    4862           0 :                     else if (pafChunk[i] == 0.0)
    4863           0 :                         pafChunk[i] = 255.0;
    4864             :                 }
    4865             :             }
    4866           0 :             else if (eWrkDataType == GDT_Byte)
    4867             :             {
    4868           0 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    4869           0 :                 for (GPtrDiff_t i = 0;
    4870           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4871             :                      i++)
    4872             :                 {
    4873           0 :                     if (pabyChunk[i] == 1)
    4874           0 :                         pabyChunk[i] = 0;
    4875           0 :                     else if (pabyChunk[i] == 0)
    4876           0 :                         pabyChunk[i] = 255;
    4877             :                 }
    4878             :             }
    4879           0 :             else if (eWrkDataType == GDT_UInt16)
    4880             :             {
    4881           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    4882           0 :                 for (GPtrDiff_t i = 0;
    4883           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4884             :                      i++)
    4885             :                 {
    4886           0 :                     if (pasChunk[i] == 1)
    4887           0 :                         pasChunk[i] = 0;
    4888           0 :                     else if (pasChunk[i] == 0)
    4889           0 :                         pasChunk[i] = 255;
    4890             :                 }
    4891             :             }
    4892           0 :             else if (eWrkDataType == GDT_Float64)
    4893             :             {
    4894           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    4895           0 :                 for (GPtrDiff_t i = 0;
    4896           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4897             :                      i++)
    4898             :                 {
    4899           0 :                     if (padfChunk[i] == 1.0)
    4900           0 :                         padfChunk[i] = 0.0;
    4901           0 :                     else if (padfChunk[i] == 0.0)
    4902           0 :                         padfChunk[i] = 255.0;
    4903             :                 }
    4904             :             }
    4905             :             else
    4906             :             {
    4907           0 :                 CPLAssert(false);
    4908             :             }
    4909             :         }
    4910             : 
    4911             :         auto oSrcBufferHolder =
    4912        1428 :             std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
    4913             :         auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
    4914        1428 :             poJobQueue ? pabyChunkNodataMask : nullptr);
    4915             : 
    4916        1505 :         for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
    4917             :              ++iOverview)
    4918             :         {
    4919         791 :             GDALRasterBand *poDstBand = papoOvrBands[iOverview];
    4920         791 :             const int nDstWidth = poDstBand->GetXSize();
    4921         791 :             const int nDstHeight = poDstBand->GetYSize();
    4922             : 
    4923         791 :             const double dfXRatioDstToSrc =
    4924         791 :                 static_cast<double>(nWidth) / nDstWidth;
    4925         791 :             const double dfYRatioDstToSrc =
    4926         791 :                 static_cast<double>(nHeight) / nDstHeight;
    4927             : 
    4928             :             /* --------------------------------------------------------------------
    4929             :              */
    4930             :             /*      Figure out the line to start writing to, and the first line
    4931             :              */
    4932             :             /*      to not write to.  In theory this approach should ensure that
    4933             :              */
    4934             :             /*      every output line will be written if all input chunks are */
    4935             :             /*      processed. */
    4936             :             /* --------------------------------------------------------------------
    4937             :              */
    4938         791 :             int nDstYOff =
    4939         791 :                 static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
    4940         791 :             if (nDstYOff == nDstHeight)
    4941           0 :                 continue;
    4942         791 :             int nDstYOff2 = static_cast<int>(
    4943         791 :                 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
    4944             : 
    4945         791 :             if (nChunkYOff + nFullResYChunk == nHeight)
    4946         784 :                 nDstYOff2 = nDstHeight;
    4947             : #if DEBUG_VERBOSE
    4948             :             CPLDebug("GDAL",
    4949             :                      "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
    4950             :                      nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
    4951             :                      nDstWidth, nDstYOff2 - nDstYOff);
    4952             : #endif
    4953             : 
    4954        1582 :             auto poJob = std::make_unique<OvrJob>();
    4955         791 :             poJob->pfnResampleFn = pfnResampleFn;
    4956         791 :             poJob->bUseGenericResampleFn = bUseGenericResampleFn;
    4957         791 :             poJob->args.eOvrDataType = poDstBand->GetRasterDataType();
    4958         791 :             poJob->args.nOvrXSize = poDstBand->GetXSize();
    4959         791 :             poJob->args.nOvrYSize = poDstBand->GetYSize();
    4960             :             const char *pszNBITS =
    4961         791 :                 poDstBand->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    4962         791 :             poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    4963         791 :             poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    4964         791 :             poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    4965         791 :             poJob->args.eWrkDataType = eWrkDataType;
    4966         791 :             poJob->pChunk = pChunk;
    4967         791 :             poJob->args.pabyChunkNodataMask = pabyChunkNodataMask;
    4968         791 :             poJob->nSrcWidth = nWidth;
    4969         791 :             poJob->nSrcHeight = nHeight;
    4970         791 :             poJob->args.nChunkXOff = 0;
    4971         791 :             poJob->args.nChunkXSize = nWidth;
    4972         791 :             poJob->args.nChunkYOff = nChunkYOffQueried;
    4973         791 :             poJob->args.nChunkYSize = nChunkYSizeQueried;
    4974         791 :             poJob->nDstWidth = nDstWidth;
    4975         791 :             poJob->args.nDstXOff = 0;
    4976         791 :             poJob->args.nDstXOff2 = nDstWidth;
    4977         791 :             poJob->args.nDstYOff = nDstYOff;
    4978         791 :             poJob->args.nDstYOff2 = nDstYOff2;
    4979         791 :             poJob->poDstBand = poDstBand;
    4980         791 :             poJob->args.pszResampling = pszResampling;
    4981         791 :             poJob->args.bHasNoData = bHasNoData;
    4982         791 :             poJob->args.dfNoDataValue = dfNoDataValue;
    4983         791 :             poJob->args.poColorTable = poColorTable;
    4984         791 :             poJob->args.eSrcDataType = eSrcDataType;
    4985         791 :             poJob->args.bPropagateNoData = bPropagateNoData;
    4986             : 
    4987         791 :             if (poJobQueue)
    4988             :             {
    4989           0 :                 poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
    4990           0 :                 poJob->SetSrcBufferHolder(oSrcBufferHolder);
    4991           0 :                 poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    4992           0 :                 jobList.emplace_back(std::move(poJob));
    4993             :             }
    4994             :             else
    4995             :             {
    4996         791 :                 JobResampleFunc(poJob.get());
    4997         791 :                 eErr = poJob->eErr;
    4998         791 :                 if (eErr == CE_None)
    4999             :                 {
    5000         791 :                     eErr = WriteJobData(poJob.get());
    5001             :                 }
    5002             :             }
    5003             :         }
    5004             : 
    5005         714 :         if (poJobQueue)
    5006             :         {
    5007           0 :             pChunk = nullptr;
    5008           0 :             pabyChunkNodataMask = nullptr;
    5009             :         }
    5010             :     }
    5011             : 
    5012         709 :     VSIFree(pChunk);
    5013         709 :     VSIFree(pabyChunkNodataMask);
    5014             : 
    5015             :     // Wait for all pending jobs to complete
    5016         709 :     while (!jobList.empty())
    5017             :     {
    5018           0 :         const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    5019           0 :         if (l_eErr != CE_None && eErr == CE_None)
    5020           0 :             eErr = l_eErr;
    5021             :     }
    5022             : 
    5023             :     /* -------------------------------------------------------------------- */
    5024             :     /*      Renormalized overview mean / stddev if needed.                  */
    5025             :     /* -------------------------------------------------------------------- */
    5026         709 :     if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
    5027             :     {
    5028           0 :         GDALOverviewMagnitudeCorrection(
    5029             :             poSrcBand, nOverviewCount,
    5030             :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
    5031             :             GDALDummyProgress, nullptr);
    5032             :     }
    5033             : 
    5034             :     /* -------------------------------------------------------------------- */
    5035             :     /*      It can be important to flush out data to overviews.             */
    5036             :     /* -------------------------------------------------------------------- */
    5037        1493 :     for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
    5038             :          ++iOverview)
    5039             :     {
    5040         784 :         eErr = papoOvrBands[iOverview]->FlushCache(false);
    5041             :     }
    5042             : 
    5043         709 :     if (eErr == CE_None)
    5044         709 :         pfnProgress(1.0, nullptr, pProgressData);
    5045             : 
    5046         709 :     return eErr;
    5047             : }
    5048             : 
    5049             : /************************************************************************/
    5050             : /*            GDALRegenerateOverviewsMultiBand()                        */
    5051             : /************************************************************************/
    5052             : 
    5053             : /**
    5054             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    5055             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
    5056             :  *
    5057             :  * This function will generate one or more overview images from a base
    5058             :  * image using the requested downsampling algorithm.  Its primary use
    5059             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    5060             :  * can also be used to generate downsampled images in one file from another
    5061             :  * outside the overview architecture.
    5062             :  *
    5063             :  * The output bands need to exist in advance and share the same characteristics
    5064             :  * (type, dimensions)
    5065             :  *
    5066             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    5067             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
    5068             :  *
    5069             :  * It does not support color tables or complex data types.
    5070             :  *
    5071             :  * The pseudo-algorithm used by the function is :
    5072             :  *    for each overview
    5073             :  *       iterate on lines of the source by a step of deltay
    5074             :  *           iterate on columns of the source  by a step of deltax
    5075             :  *               read the source data of size deltax * deltay for all the bands
    5076             :  *               generate the corresponding overview block for all the bands
    5077             :  *
    5078             :  * This function will properly honour NODATA_VALUES tuples (special dataset
    5079             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    5080             :  * considered as the nodata value and not each value of the triplet
    5081             :  * independently per band.
    5082             :  *
    5083             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    5084             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    5085             :  * overview computation.
    5086             :  *
    5087             :  * @param nBands the number of bands, size of papoSrcBands and size of
    5088             :  *               first dimension of papapoOverviewBands
    5089             :  * @param papoSrcBands the list of source bands to downsample
    5090             :  * @param nOverviews the number of downsampled overview levels being generated.
    5091             :  * @param papapoOverviewBands two-dimensional array of bands. First dimension
    5092             :  *                            is indexed by nBands. Second dimension is indexed
    5093             :  *                            by nOverviews.
    5094             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    5095             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
    5096             :  * @param pfnProgress progress report function.
    5097             :  * @param pProgressData progress function callback data.
    5098             :  * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
    5099             :  *                     key=value pairs, or NULL
    5100             :  *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
    5101             :  *                     options can be specified to express that overviews should
    5102             :  *                     be regenerated only in the specified subset of the source
    5103             :  *                     dataset.
    5104             :  * @return CE_None on success or CE_Failure on failure.
    5105             :  */
    5106             : 
    5107         374 : CPLErr GDALRegenerateOverviewsMultiBand(
    5108             :     int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
    5109             :     GDALRasterBand *const *const *papapoOverviewBands,
    5110             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    5111             :     void *pProgressData, CSLConstList papszOptions)
    5112             : {
    5113         374 :     CPL_IGNORE_RET_VAL(papszOptions);
    5114             : 
    5115         374 :     if (pfnProgress == nullptr)
    5116           6 :         pfnProgress = GDALDummyProgress;
    5117             : 
    5118         374 :     if (EQUAL(pszResampling, "NONE"))
    5119           2 :         return CE_None;
    5120             : 
    5121             :     // Sanity checks.
    5122         372 :     if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
    5123         177 :         !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
    5124          76 :         !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
    5125          18 :         !EQUAL(pszResampling, "CUBICSPLINE") &&
    5126          17 :         !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
    5127           5 :         !EQUAL(pszResampling, "MODE"))
    5128             :     {
    5129           0 :         CPLError(CE_Failure, CPLE_NotSupported,
    5130             :                  "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
    5131             :                  "not supported",
    5132             :                  pszResampling);
    5133           0 :         return CE_Failure;
    5134             :     }
    5135             : 
    5136         372 :     int nKernelRadius = 0;
    5137             :     GDALResampleFunction pfnResampleFn =
    5138         372 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    5139         372 :     if (pfnResampleFn == nullptr)
    5140           0 :         return CE_Failure;
    5141             : 
    5142         372 :     const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
    5143         372 :     const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
    5144         372 :     if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
    5145           0 :         return CE_None;
    5146         372 :     GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
    5147         688 :     for (int iBand = 1; iBand < nBands; ++iBand)
    5148             :     {
    5149         632 :         if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
    5150         316 :             papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
    5151             :         {
    5152           0 :             CPLError(
    5153             :                 CE_Failure, CPLE_NotSupported,
    5154             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5155             :                 "have the same dimensions");
    5156           0 :             return CE_Failure;
    5157             :         }
    5158         316 :         if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
    5159             :         {
    5160           0 :             CPLError(
    5161             :                 CE_Failure, CPLE_NotSupported,
    5162             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5163             :                 "have the same data type");
    5164           0 :             return CE_Failure;
    5165             :         }
    5166             :     }
    5167             : 
    5168         988 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5169             :     {
    5170         616 :         const auto poOvrFirstBand = papapoOverviewBands[0][iOverview];
    5171         616 :         const int nDstWidth = poOvrFirstBand->GetXSize();
    5172         616 :         const int nDstHeight = poOvrFirstBand->GetYSize();
    5173        1210 :         for (int iBand = 1; iBand < nBands; ++iBand)
    5174             :         {
    5175         594 :             const auto poOvrBand = papapoOverviewBands[iBand][iOverview];
    5176        1188 :             if (poOvrBand->GetXSize() != nDstWidth ||
    5177         594 :                 poOvrBand->GetYSize() != nDstHeight)
    5178             :             {
    5179           0 :                 CPLError(
    5180             :                     CE_Failure, CPLE_NotSupported,
    5181             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5182             :                     "of the same level must have the same dimensions");
    5183           0 :                 return CE_Failure;
    5184             :             }
    5185         594 :             if (poOvrBand->GetRasterDataType() != eDataType)
    5186             :             {
    5187           0 :                 CPLError(
    5188             :                     CE_Failure, CPLE_NotSupported,
    5189             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5190             :                     "must have the same data type as the source bands");
    5191           0 :                 return CE_Failure;
    5192             :             }
    5193             :         }
    5194             :     }
    5195             : 
    5196             :     // First pass to compute the total number of pixels to write.
    5197         372 :     double dfTotalPixelCount = 0;
    5198         372 :     const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
    5199         372 :     const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
    5200         372 :     const int nSrcXSize = atoi(CSLFetchNameValueDef(
    5201             :         papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
    5202         372 :     const int nSrcYSize = atoi(CSLFetchNameValueDef(
    5203             :         papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
    5204         988 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5205             :     {
    5206         616 :         dfTotalPixelCount +=
    5207        1232 :             static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
    5208         616 :             papapoOverviewBands[0][iOverview]->GetXSize() *
    5209        1232 :             static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
    5210         616 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5211             :     }
    5212             : 
    5213             :     const GDALDataType eWrkDataType =
    5214         372 :         GDALGetOvrWorkDataType(pszResampling, eDataType);
    5215         372 :     const int nWrkDataTypeSize = GDALGetDataTypeSizeBytes(eWrkDataType);
    5216             : 
    5217         372 :     const bool bIsMask = papoSrcBands[0]->IsMaskBand();
    5218             : 
    5219             :     // If we have a nodata mask and we are doing something more complicated
    5220             :     // than nearest neighbour resampling, we have to fetch the nodata mask.
    5221             :     const bool bUseNoDataMask =
    5222         541 :         !STARTS_WITH_CI(pszResampling, "NEAR") &&
    5223         169 :         (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
    5224             : 
    5225             :     bool *const pabHasNoData =
    5226         372 :         static_cast<bool *>(VSI_MALLOC_VERBOSE(nBands * sizeof(bool)));
    5227             :     double *const padfNoDataValue =
    5228         372 :         static_cast<double *>(VSI_MALLOC_VERBOSE(nBands * sizeof(double)));
    5229         372 :     if (pabHasNoData == nullptr || padfNoDataValue == nullptr)
    5230             :     {
    5231           0 :         CPLFree(pabHasNoData);
    5232           0 :         CPLFree(padfNoDataValue);
    5233           0 :         return CE_Failure;
    5234             :     }
    5235             : 
    5236        1060 :     for (int iBand = 0; iBand < nBands; ++iBand)
    5237             :     {
    5238         688 :         int nHasNoData = 0;
    5239        1376 :         padfNoDataValue[iBand] =
    5240         688 :             papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
    5241         688 :         pabHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
    5242             :     }
    5243             :     const bool bPropagateNoData =
    5244         372 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    5245             : 
    5246         372 :     const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
    5247        1488 :     const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
    5248         372 :                                                        ? CPLGetNumCPUs()
    5249         372 :                                                        : atoi(pszThreads)));
    5250             :     auto poThreadPool =
    5251         372 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    5252             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    5253         372 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    5254             : 
    5255             :     // Only configurable for debug / testing
    5256             :     const int nChunkMaxSize = std::max(
    5257         372 :         100, atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760")));
    5258             : 
    5259             :     // Second pass to do the real job.
    5260         372 :     double dfCurPixelCount = 0;
    5261         372 :     CPLErr eErr = CE_None;
    5262         987 :     for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
    5263             :          ++iOverview)
    5264             :     {
    5265         615 :         int iSrcOverview = -1;  // -1 means the source bands.
    5266             : 
    5267             :         const int nDstTotalWidth =
    5268         615 :             papapoOverviewBands[0][iOverview]->GetXSize();
    5269             :         const int nDstTotalHeight =
    5270         615 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5271             : 
    5272             :         // Compute the coordinates of the target region to refresh
    5273         615 :         constexpr double EPS = 1e-8;
    5274         615 :         const int nDstXOffStart = static_cast<int>(
    5275         615 :             static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
    5276             :             EPS);
    5277             :         const int nDstXOffEnd =
    5278        1230 :             std::min(static_cast<int>(
    5279         615 :                          std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
    5280         615 :                                        nToplevelSrcWidth * nDstTotalWidth -
    5281             :                                    EPS)),
    5282         615 :                      nDstTotalWidth);
    5283         615 :         const int nDstWidth = nDstXOffEnd - nDstXOffStart;
    5284         615 :         const int nDstYOffStart =
    5285         615 :             static_cast<int>(static_cast<double>(nSrcYOff) /
    5286         615 :                                  nToplevelSrcHeight * nDstTotalHeight +
    5287             :                              EPS);
    5288             :         const int nDstYOffEnd =
    5289        1230 :             std::min(static_cast<int>(
    5290         615 :                          std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
    5291         615 :                                        nToplevelSrcHeight * nDstTotalHeight -
    5292             :                                    EPS)),
    5293         615 :                      nDstTotalHeight);
    5294             : 
    5295             :         // Try to use previous level of overview as the source to compute
    5296             :         // the next level.
    5297         615 :         int nSrcWidth = nToplevelSrcWidth;
    5298         615 :         int nSrcHeight = nToplevelSrcHeight;
    5299         858 :         if (iOverview > 0 &&
    5300         243 :             papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
    5301             :         {
    5302         235 :             nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
    5303         235 :             nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
    5304         235 :             iSrcOverview = iOverview - 1;
    5305             :         }
    5306             : 
    5307         615 :         const double dfXRatioDstToSrc =
    5308         615 :             static_cast<double>(nSrcWidth) / nDstTotalWidth;
    5309         615 :         const double dfYRatioDstToSrc =
    5310         615 :             static_cast<double>(nSrcHeight) / nDstTotalHeight;
    5311             : 
    5312        1230 :         int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    5313         615 :                                   static_cast<int>(0.5 + dfYRatioDstToSrc));
    5314         615 :         if (nOvrFactor == 0)
    5315           0 :             nOvrFactor = 1;
    5316             : 
    5317         615 :         int nDstChunkXSize = 0;
    5318         615 :         int nDstChunkYSize = 0;
    5319         615 :         papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
    5320             :                                                         &nDstChunkYSize);
    5321             : 
    5322             :         const char *pszDST_CHUNK_X_SIZE =
    5323         615 :             CSLFetchNameValue(papszOptions, "DST_CHUNK_X_SIZE");
    5324             :         const char *pszDST_CHUNK_Y_SIZE =
    5325         615 :             CSLFetchNameValue(papszOptions, "DST_CHUNK_Y_SIZE");
    5326         615 :         if (pszDST_CHUNK_X_SIZE && pszDST_CHUNK_Y_SIZE)
    5327             :         {
    5328          12 :             nDstChunkXSize = std::max(1, atoi(pszDST_CHUNK_X_SIZE));
    5329          12 :             nDstChunkYSize = std::max(1, atoi(pszDST_CHUNK_Y_SIZE));
    5330          12 :             CPLDebug("GDAL", "Using dst chunk size %d x %d", nDstChunkXSize,
    5331             :                      nDstChunkYSize);
    5332             :         }
    5333             : 
    5334             :         // Try to extend the chunk size so that the memory needed to acquire
    5335             :         // source pixels goes up to 10 MB.
    5336             :         // This can help for drivers that support multi-threaded reading
    5337         615 :         const int nFullResYChunk =
    5338         615 :             2 + static_cast<int>(nDstChunkYSize * dfYRatioDstToSrc);
    5339         615 :         const int nFullResYChunkQueried =
    5340         615 :             nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
    5341         857 :         while (nDstChunkXSize < nDstWidth)
    5342             :         {
    5343         259 :             const int nFullResXChunk =
    5344         259 :                 2 + static_cast<int>(2 * nDstChunkXSize * dfXRatioDstToSrc);
    5345             : 
    5346         259 :             const int nFullResXChunkQueried =
    5347         259 :                 nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    5348             : 
    5349         259 :             if (static_cast<GIntBig>(nFullResXChunkQueried) *
    5350         259 :                     nFullResYChunkQueried * nBands * nWrkDataTypeSize >
    5351         259 :                 nChunkMaxSize)
    5352             :             {
    5353          17 :                 break;
    5354             :             }
    5355             : 
    5356         242 :             nDstChunkXSize *= 2;
    5357             :         }
    5358         615 :         nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
    5359             : 
    5360         615 :         const int nFullResXChunk =
    5361         615 :             2 + static_cast<int>(nDstChunkXSize * dfXRatioDstToSrc);
    5362         615 :         const int nFullResXChunkQueried =
    5363         615 :             nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    5364             : 
    5365             :         // Make sure that the RAM requirements to acquire the source data does
    5366             :         // not exceed nChunkMaxSize
    5367             :         // If so, reduce the destination chunk size, generate overviews in a
    5368             :         // temporary dataset, and copy that temporary dataset over the target
    5369             :         // overview bands (to avoid issues with lossy compression)
    5370         615 :         const auto nMemRequirement =
    5371         615 :             static_cast<GIntBig>(nFullResXChunkQueried) *
    5372         615 :             nFullResYChunkQueried * nBands * nWrkDataTypeSize;
    5373         615 :         if (nMemRequirement > nChunkMaxSize &&
    5374          10 :             !(pszDST_CHUNK_X_SIZE && pszDST_CHUNK_Y_SIZE))
    5375             :         {
    5376             :             // Compute a smaller destination chunk size
    5377          12 :             const auto nOverShootFactor = nMemRequirement / nChunkMaxSize;
    5378             :             const auto nSqrtOverShootFactor = std::max<GIntBig>(
    5379          24 :                 4, static_cast<GIntBig>(std::ceil(
    5380          12 :                        std::sqrt(static_cast<double>(nOverShootFactor)))));
    5381             :             const int nReducedDstChunkXSize = std::max(
    5382          12 :                 1, static_cast<int>(nDstChunkXSize / nSqrtOverShootFactor));
    5383             :             const int nReducedDstChunkYSize = std::max(
    5384          12 :                 1, static_cast<int>(nDstChunkYSize / nSqrtOverShootFactor));
    5385          12 :             if (nReducedDstChunkXSize < nDstChunkXSize ||
    5386           0 :                 nReducedDstChunkYSize < nDstChunkYSize)
    5387             :             {
    5388          12 :                 CPLStringList aosOptions(papszOptions);
    5389             :                 aosOptions.SetNameValue(
    5390             :                     "DST_CHUNK_X_SIZE",
    5391          12 :                     CPLSPrintf("%d", nReducedDstChunkXSize));
    5392             :                 aosOptions.SetNameValue(
    5393             :                     "DST_CHUNK_Y_SIZE",
    5394          12 :                     CPLSPrintf("%d", nReducedDstChunkYSize));
    5395             : 
    5396             :                 const auto nTmpDSMemRequirement =
    5397          12 :                     static_cast<GIntBig>(nDstTotalWidth) * nDstTotalHeight *
    5398          12 :                     nBands * GDALGetDataTypeSizeBytes(eDataType);
    5399           0 :                 std::unique_ptr<GDALDataset> poTmpDS;
    5400             :                 // Config option mostly/only for autotest purposes
    5401             :                 const char *pszGDAL_OVR_TEMP_DRIVER =
    5402          12 :                     CPLGetConfigOption("GDAL_OVR_TEMP_DRIVER", "");
    5403          12 :                 if ((nTmpDSMemRequirement <= nChunkMaxSize &&
    5404           2 :                      !EQUAL(pszGDAL_OVR_TEMP_DRIVER, "GTIFF")) ||
    5405          10 :                     EQUAL(pszGDAL_OVR_TEMP_DRIVER, "MEM"))
    5406             :                 {
    5407             :                     auto poTmpDrv =
    5408          11 :                         GetGDALDriverManager()->GetDriverByName("MEM");
    5409          11 :                     if (!poTmpDrv)
    5410             :                     {
    5411           0 :                         eErr = CE_Failure;
    5412           0 :                         break;
    5413             :                     }
    5414          11 :                     poTmpDS.reset(poTmpDrv->Create("", nDstTotalWidth,
    5415             :                                                    nDstTotalHeight, nBands,
    5416          11 :                                                    eDataType, nullptr));
    5417             :                 }
    5418             :                 else
    5419             :                 {
    5420             :                     auto poTmpDrv =
    5421           1 :                         GetGDALDriverManager()->GetDriverByName("GTiff");
    5422           1 :                     if (!poTmpDrv)
    5423             :                     {
    5424           0 :                         eErr = CE_Failure;
    5425           0 :                         break;
    5426             :                     }
    5427           2 :                     std::string osTmpFilename;
    5428           1 :                     auto poDstDS = papapoOverviewBands[0][0]->GetDataset();
    5429           1 :                     if (poDstDS)
    5430             :                     {
    5431           1 :                         osTmpFilename = poDstDS->GetDescription();
    5432             :                         VSIStatBufL sStatBuf;
    5433           1 :                         if (!osTmpFilename.empty() &&
    5434           0 :                             VSIStatL(osTmpFilename.c_str(), &sStatBuf) == 0)
    5435           0 :                             osTmpFilename += "_tmp_ovr.tif";
    5436             :                     }
    5437           1 :                     if (osTmpFilename.empty())
    5438             :                     {
    5439           1 :                         osTmpFilename = CPLGenerateTempFilenameSafe(nullptr);
    5440           1 :                         osTmpFilename += ".tif";
    5441             :                     }
    5442           1 :                     CPLDebug("GDAL",
    5443             :                              "Creating temporary file %s of %d x %d x %d",
    5444             :                              osTmpFilename.c_str(), nDstTotalWidth,
    5445             :                              nDstTotalHeight, nBands);
    5446           2 :                     CPLStringList aosCO;
    5447           1 :                     poTmpDS.reset(poTmpDrv->Create(
    5448             :                         osTmpFilename.c_str(), nDstTotalWidth, nDstTotalHeight,
    5449           1 :                         nBands, eDataType, aosCO.List()));
    5450           1 :                     if (poTmpDS)
    5451             :                     {
    5452           1 :                         poTmpDS->MarkSuppressOnClose();
    5453           1 :                         VSIUnlink(osTmpFilename.c_str());
    5454             :                     }
    5455             :                 }
    5456          12 :                 if (!poTmpDS)
    5457             :                 {
    5458           0 :                     eErr = CE_Failure;
    5459           0 :                     break;
    5460             :                 }
    5461             : 
    5462          12 :                 std::vector<GDALRasterBand **> apapoOverviewBands(nBands);
    5463          27 :                 for (int i = 0; i < nBands; ++i)
    5464             :                 {
    5465          30 :                     apapoOverviewBands[i] = static_cast<GDALRasterBand **>(
    5466          15 :                         CPLMalloc(sizeof(GDALRasterBand *)));
    5467          15 :                     apapoOverviewBands[i][0] = poTmpDS->GetRasterBand(i + 1);
    5468             :                 }
    5469             : 
    5470             :                 const double dfExtraPixels =
    5471          24 :                     static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
    5472          12 :                     papapoOverviewBands[0][iOverview]->GetXSize() *
    5473          24 :                     static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
    5474          12 :                     papapoOverviewBands[0][iOverview]->GetYSize();
    5475             : 
    5476          24 :                 void *pScaledProgressData = GDALCreateScaledProgress(
    5477             :                     dfCurPixelCount / dfTotalPixelCount,
    5478          12 :                     (dfCurPixelCount + dfExtraPixels) / dfTotalPixelCount,
    5479             :                     pfnProgress, pProgressData);
    5480             : 
    5481             :                 // Generate overviews in temporary dataset
    5482          12 :                 eErr = GDALRegenerateOverviewsMultiBand(
    5483          12 :                     nBands, papoSrcBands, 1, apapoOverviewBands.data(),
    5484             :                     pszResampling, GDALScaledProgress, pScaledProgressData,
    5485          12 :                     aosOptions.List());
    5486             : 
    5487          12 :                 GDALDestroyScaledProgress(pScaledProgressData);
    5488             : 
    5489          12 :                 dfCurPixelCount += dfExtraPixels;
    5490             : 
    5491          27 :                 for (int i = 0; i < nBands; ++i)
    5492             :                 {
    5493          15 :                     CPLFree(apapoOverviewBands[i]);
    5494             :                 }
    5495             : 
    5496             :                 // Copy temporary dataset to destination overview bands
    5497             : 
    5498          12 :                 if (eErr == CE_None)
    5499             :                 {
    5500             :                     // Check if all papapoOverviewBands[][iOverview] bands point
    5501             :                     // to the same dataset. If so, we can use
    5502             :                     // GDALDatasetCopyWholeRaster()
    5503             :                     GDALDataset *poDstOvrBandDS =
    5504          12 :                         papapoOverviewBands[0][iOverview]->GetDataset();
    5505          12 :                     if (poDstOvrBandDS)
    5506             :                     {
    5507          15 :                         if (poDstOvrBandDS->GetRasterCount() != nBands ||
    5508           3 :                             poDstOvrBandDS->GetRasterBand(1) !=
    5509           3 :                                 papapoOverviewBands[0][iOverview])
    5510             :                         {
    5511           9 :                             poDstOvrBandDS = nullptr;
    5512             :                         }
    5513             :                         else
    5514             :                         {
    5515           6 :                             for (int i = 1; poDstOvrBandDS && i < nBands; ++i)
    5516             :                             {
    5517             :                                 GDALDataset *poThisDstOvrBandDS =
    5518           3 :                                     papapoOverviewBands[i][iOverview]
    5519           3 :                                         ->GetDataset();
    5520           3 :                                 if (poThisDstOvrBandDS == nullptr ||
    5521           6 :                                     poThisDstOvrBandDS != poDstOvrBandDS ||
    5522           3 :                                     poThisDstOvrBandDS->GetRasterBand(i + 1) !=
    5523           3 :                                         papapoOverviewBands[i][iOverview])
    5524             :                                 {
    5525           0 :                                     poDstOvrBandDS = nullptr;
    5526             :                                 }
    5527             :                             }
    5528             :                         }
    5529             :                     }
    5530          12 :                     if (poDstOvrBandDS)
    5531             :                     {
    5532           3 :                         eErr = GDALDatasetCopyWholeRaster(
    5533             :                             GDALDataset::ToHandle(poTmpDS.get()),
    5534             :                             GDALDataset::ToHandle(poDstOvrBandDS), nullptr,
    5535             :                             nullptr, nullptr);
    5536             :                     }
    5537             :                     else
    5538             :                     {
    5539          18 :                         for (int i = 0; eErr == CE_None && i < nBands; ++i)
    5540             :                         {
    5541           9 :                             eErr = GDALRasterBandCopyWholeRaster(
    5542             :                                 GDALRasterBand::ToHandle(
    5543             :                                     poTmpDS->GetRasterBand(i + 1)),
    5544             :                                 GDALRasterBand::ToHandle(
    5545           9 :                                     papapoOverviewBands[i][iOverview]),
    5546             :                                 nullptr, nullptr, nullptr);
    5547             :                         }
    5548             :                     }
    5549             :                 }
    5550             : 
    5551          12 :                 if (eErr != CE_None)
    5552           0 :                     break;
    5553             : 
    5554          12 :                 continue;
    5555             :             }
    5556             :         }
    5557             : 
    5558             :         // One resampling job: its buffers, inputs, outputs and completion flag
    5559             :         struct OvrJob
    5560             :         {
    5561             :             // Buffers to free when job is finished
    5562             :             std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};
    5563             :             std::unique_ptr<PointerHolder> oSrcBufferHolder{};
    5564             :             std::unique_ptr<PointerHolder> oDstBufferHolder{};
    5565             :             // Band that WriteJobData() writes the resampled data to
    5566             :             GDALRasterBand *poDstBand = nullptr;
    5567             : 
    5568             :             // Input parameters of pfnResampleFn
    5569             :             GDALResampleFunction pfnResampleFn = nullptr;
    5570             :             GDALOverviewResampleArgs args{};
    5571             :             const void *pChunk = nullptr;
    5572             : 
    5573             :             // Outputs of pfnResampleFn; eErr stays CE_Failure until it has run
    5574             :             CPLErr eErr = CE_Failure;
    5575             :             void *pDstBuffer = nullptr;
    5576             :             GDALDataType eDstBufferDataType = GDT_Unknown;
    5577             : 
    5578             :             // Synchronization: bFinished is set under mutex, then cv notified
    5579             :             bool bFinished = false;
    5580             :             std::mutex mutex{};
    5581             :             std::condition_variable cv{};
    5582             :         };
    5583             : 
    5584             :         // Job function: resamples the chunk described by an OvrJob, stores the result in the job and signals completion. Submitted to the job queue or called directly.
    5585       16316 :         const auto JobResampleFunc = [](void *pData)
    5586             :         {
    5587       16316 :             OvrJob *poJob = static_cast<OvrJob *>(pData);  // pData must point to an OvrJob
    5588             : 
    5589       16316 :             poJob->eErr = poJob->pfnResampleFn(poJob->args, poJob->pChunk,
    5590             :                                                &(poJob->pDstBuffer),
    5591             :                                                &(poJob->eDstBufferDataType));
    5592             : 
    5593       16316 :             poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));  // NOTE(review): presumably frees the output buffer with the job - confirm in PointerHolder
    5594             : 
    5595             :             {
    5596       32632 :                 std::lock_guard<std::mutex> guard(poJob->mutex);
    5597       16316 :                 poJob->bFinished = true;  // written under the mutex that readers of bFinished take
    5598       16316 :                 poJob->cv.notify_one();  // wakes WaitAndFinalizeOldestJob if it waits on this job
    5599             :             }
    5600       16316 :         };
    5601             : 
    5602             :         // Writes the resampled buffer of a job to its destination band and returns the RasterIO() result
    5603       16316 :         const auto WriteJobData = [](const OvrJob *poJob)
    5604             :         {
    5605       32632 :             return poJob->poDstBand->RasterIO(
    5606       16316 :                 GF_Write, poJob->args.nDstXOff, poJob->args.nDstYOff,
    5607       16316 :                 poJob->args.nDstXOff2 - poJob->args.nDstXOff,  // window width
    5608       16316 :                 poJob->args.nDstYOff2 - poJob->args.nDstYOff, poJob->pDstBuffer,  // window height, data
    5609       16316 :                 poJob->args.nDstXOff2 - poJob->args.nDstXOff,  // buffer width, same as the window
    5610       16316 :                 poJob->args.nDstYOff2 - poJob->args.nDstYOff,  // buffer height, same as the window
    5611       16316 :                 poJob->eDstBufferDataType, 0, 0, nullptr);  // 0, 0: default pixel and line spacing
    5612             :         };
    5613             : 
    5614             :         // Wait for completion of the oldest (front) job, write its result if it succeeded, remove it from the list and return the resulting error code
    5615             :         const auto WaitAndFinalizeOldestJob =
    5616          38 :             [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    5617             :         {
    5618          38 :             auto poOldestJob = jobList.front().get();  // jobList must not be empty
    5619             :             {
    5620          76 :                 std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
    5621             :                 // coverity[missing_lock:FALSE]
    5622          52 :                 while (!poOldestJob->bFinished)  // re-check the flag after every wakeup
    5623             :                 {
    5624          14 :                     poOldestJob->cv.wait(oGuard);
    5625             :                 }
    5626             :             }
    5627          38 :             CPLErr l_eErr = poOldestJob->eErr;
    5628          38 :             if (l_eErr == CE_None)  // nothing is written for a job whose resampling failed
    5629             :             {
    5630          38 :                 l_eErr = WriteJobData(poOldestJob);
    5631             :             }
    5632             : 
    5633          38 :             jobList.pop_front();  // destroys the job together with its buffer holders
    5634          38 :             return l_eErr;
    5635             :         };
    5636             : 
    5637             :         // Queue of jobs
    5638        1206 :         std::list<std::unique_ptr<OvrJob>> jobList;
    5639             : 
    5640        1206 :         std::vector<void *> apaChunk(nBands);
    5641        1206 :         std::vector<GByte *> apabyChunkNoDataMask(nBands);
    5642             : 
    5643             :         // Iterate on destination overview, block by block.
    5644         603 :         for (int nDstYOff = nDstYOffStart;
    5645        2278 :              nDstYOff < nDstYOffEnd && eErr == CE_None;
    5646        1675 :              nDstYOff += nDstChunkYSize)
    5647             :         {
    5648             :             int nDstYCount;
    5649        1675 :             if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
    5650        1257 :                 nDstYCount = nDstChunkYSize;
    5651             :             else
    5652         418 :                 nDstYCount = nDstYOffEnd - nDstYOff;
    5653             : 
    5654        1675 :             int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    5655        1675 :             int nChunkYOff2 = static_cast<int>(
    5656        1675 :                 ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
    5657        1675 :             if (nChunkYOff2 > nSrcHeight ||
    5658        1675 :                 nDstYOff + nDstYCount == nDstTotalHeight)
    5659         600 :                 nChunkYOff2 = nSrcHeight;
    5660        1675 :             int nYCount = nChunkYOff2 - nChunkYOff;
    5661        1675 :             CPLAssert(nYCount <= nFullResYChunk);
    5662             : 
    5663        1675 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    5664        1675 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
    5665        1675 :             if (nChunkYOffQueried < 0)
    5666             :             {
    5667         140 :                 nChunkYSizeQueried += nChunkYOffQueried;
    5668         140 :                 nChunkYOffQueried = 0;
    5669             :             }
    5670        1675 :             if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
    5671         139 :                 nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
    5672        1675 :             CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
    5673             : 
    5674        1675 :             if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount, nullptr,
    5675             :                              pProgressData))
    5676             :             {
    5677           1 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    5678           1 :                 eErr = CE_Failure;
    5679             :             }
    5680             : 
    5681             :             // Iterate on the blocks of the current row of destination blocks.
    5682        1675 :             for (int nDstXOff = nDstXOffStart;
    5683       10129 :                  nDstXOff < nDstXOffEnd && eErr == CE_None;
    5684        8454 :                  nDstXOff += nDstChunkXSize)
    5685             :             {
    5686        8454 :                 int nDstXCount = 0;
    5687        8454 :                 if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
    5688        8257 :                     nDstXCount = nDstChunkXSize;
    5689             :                 else
    5690         197 :                     nDstXCount = nDstXOffEnd - nDstXOff;
    5691             : 
    5692        8454 :                 dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
    5693             : 
    5694        8454 :                 int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    5695        8454 :                 int nChunkXOff2 = static_cast<int>(
    5696        8454 :                     ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    5697        8454 :                 if (nChunkXOff2 > nSrcWidth ||
    5698        8454 :                     nDstXOff + nDstXCount == nDstTotalWidth)
    5699        1673 :                     nChunkXOff2 = nSrcWidth;
    5700        8454 :                 const int nXCount = nChunkXOff2 - nChunkXOff;
    5701        8454 :                 CPLAssert(nXCount <= nFullResXChunk);
    5702             : 
    5703        8454 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    5704        8454 :                 int nChunkXSizeQueried =
    5705        8454 :                     nXCount + 2 * nKernelRadius * nOvrFactor;
    5706        8454 :                 if (nChunkXOffQueried < 0)
    5707             :                 {
    5708         200 :                     nChunkXSizeQueried += nChunkXOffQueried;
    5709         200 :                     nChunkXOffQueried = 0;
    5710             :                 }
    5711        8454 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
    5712         203 :                     nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
    5713        8454 :                 CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
    5714             : #if DEBUG_VERBOSE
    5715             :                 CPLDebug("GDAL",
    5716             :                          "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
    5717             :                          nChunkXOffQueried, nChunkYOffQueried,
    5718             :                          nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
    5719             :                          nDstYOff, nDstXCount, nDstYCount);
    5720             : #endif
    5721             : 
    5722             :                 // Avoid accumulating too many tasks and exhausting RAM.
    5723             : 
    5724             :                 // Try to complete already finished jobs
    5725       16528 :                 while (eErr == CE_None && !jobList.empty())
    5726             :                 {
    5727        8133 :                     auto poOldestJob = jobList.front().get();
    5728             :                     {
    5729        8133 :                         std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
    5730        8133 :                         if (!poOldestJob->bFinished)
    5731             :                         {
    5732          59 :                             break;
    5733             :                         }
    5734             :                     }
    5735        8074 :                     eErr = poOldestJob->eErr;
    5736        8074 :                     if (eErr == CE_None)
    5737             :                     {
    5738        8074 :                         eErr = WriteJobData(poOldestJob);
    5739             :                     }
    5740             : 
    5741        8074 :                     jobList.pop_front();
    5742             :                 }
    5743             : 
    5744             :                 // And in case we have saturated the number of threads,
    5745             :                 // wait for completion of tasks to go below the threshold.
    5746       16952 :                 while (eErr == CE_None &&
    5747        8476 :                        jobList.size() >= static_cast<size_t>(nThreads))
    5748             :                 {
    5749          22 :                     eErr = WaitAndFinalizeOldestJob(jobList);
    5750             :                 }
    5751             : 
    5752             :                 // (Re)allocate buffers if needed
    5753       24771 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    5754             :                 {
    5755       16317 :                     if (apaChunk[iBand] == nullptr)
    5756             :                     {
    5757        9292 :                         apaChunk[iBand] = VSI_MALLOC3_VERBOSE(
    5758             :                             nFullResXChunkQueried, nFullResYChunkQueried,
    5759             :                             nWrkDataTypeSize);
    5760        9292 :                         if (apaChunk[iBand] == nullptr)
    5761             :                         {
    5762           0 :                             eErr = CE_Failure;
    5763             :                         }
    5764             :                     }
    5765       24754 :                     if (bUseNoDataMask &&
    5766        8437 :                         apabyChunkNoDataMask[iBand] == nullptr)
    5767             :                     {
    5768       16756 :                         apabyChunkNoDataMask[iBand] =
    5769        8378 :                             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    5770             :                                 nFullResXChunkQueried, nFullResYChunkQueried));
    5771        8378 :                         if (apabyChunkNoDataMask[iBand] == nullptr)
    5772             :                         {
    5773           0 :                             eErr = CE_Failure;
    5774             :                         }
    5775             :                     }
    5776             :                 }
    5777             : 
    5778             :                 // Read the source buffers for all the bands.
    5779       24771 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    5780             :                 {
    5781       16317 :                     GDALRasterBand *poSrcBand = nullptr;
    5782       16317 :                     if (iSrcOverview == -1)
    5783       15405 :                         poSrcBand = papoSrcBands[iBand];
    5784             :                     else
    5785         912 :                         poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
    5786       16317 :                     eErr = poSrcBand->RasterIO(
    5787             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    5788       16317 :                         nChunkXSizeQueried, nChunkYSizeQueried, apaChunk[iBand],
    5789             :                         nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType, 0,
    5790             :                         0, nullptr);
    5791             : 
    5792       16317 :                     if (bUseNoDataMask && eErr == CE_None)
    5793             :                     {
    5794        8437 :                         auto poMaskBand = poSrcBand->IsMaskBand()
    5795        8437 :                                               ? poSrcBand
    5796        6334 :                                               : poSrcBand->GetMaskBand();
    5797        8437 :                         eErr = poMaskBand->RasterIO(
    5798             :                             GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    5799             :                             nChunkXSizeQueried, nChunkYSizeQueried,
    5800        8437 :                             apabyChunkNoDataMask[iBand], nChunkXSizeQueried,
    5801             :                             nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
    5802             :                     }
    5803             :                 }
    5804             : 
    5805             :                 // Compute the resulting overview block.
    5806       24770 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    5807             :                 {
    5808       32632 :                     auto poJob = std::make_unique<OvrJob>();
    5809       16316 :                     poJob->pfnResampleFn = pfnResampleFn;
    5810       16316 :                     poJob->poDstBand = papapoOverviewBands[iBand][iOverview];
    5811       32632 :                     poJob->args.eOvrDataType =
    5812       16316 :                         poJob->poDstBand->GetRasterDataType();
    5813       16316 :                     poJob->args.nOvrXSize = poJob->poDstBand->GetXSize();
    5814       16316 :                     poJob->args.nOvrYSize = poJob->poDstBand->GetYSize();
    5815       16316 :                     const char *pszNBITS = poJob->poDstBand->GetMetadataItem(
    5816       16316 :                         "NBITS", "IMAGE_STRUCTURE");
    5817       16316 :                     poJob->args.nOvrNBITS = pszNBITS ? atoi(pszNBITS) : 0;
    5818       16316 :                     poJob->args.dfXRatioDstToSrc = dfXRatioDstToSrc;
    5819       16316 :                     poJob->args.dfYRatioDstToSrc = dfYRatioDstToSrc;
    5820       16316 :                     poJob->args.eWrkDataType = eWrkDataType;
    5821       16316 :                     poJob->pChunk = apaChunk[iBand];
    5822       16316 :                     poJob->args.pabyChunkNodataMask =
    5823       16316 :                         apabyChunkNoDataMask[iBand];
    5824       16316 :                     poJob->args.nChunkXOff = nChunkXOffQueried;
    5825       16316 :                     poJob->args.nChunkXSize = nChunkXSizeQueried;
    5826       16316 :                     poJob->args.nChunkYOff = nChunkYOffQueried;
    5827       16316 :                     poJob->args.nChunkYSize = nChunkYSizeQueried;
    5828       16316 :                     poJob->args.nDstXOff = nDstXOff;
    5829       16316 :                     poJob->args.nDstXOff2 = nDstXOff + nDstXCount;
    5830       16316 :                     poJob->args.nDstYOff = nDstYOff;
    5831       16316 :                     poJob->args.nDstYOff2 = nDstYOff + nDstYCount;
    5832       16316 :                     poJob->args.pszResampling = pszResampling;
    5833       16316 :                     poJob->args.bHasNoData = pabHasNoData[iBand];
    5834       16316 :                     poJob->args.dfNoDataValue = padfNoDataValue[iBand];
    5835       16316 :                     poJob->args.eSrcDataType = eDataType;
    5836       16316 :                     poJob->args.bPropagateNoData = bPropagateNoData;
    5837             : 
    5838       16316 :                     if (poJobQueue)
    5839             :                     {
    5840       16224 :                         poJob->oSrcMaskBufferHolder.reset(
    5841        8112 :                             new PointerHolder(apabyChunkNoDataMask[iBand]));
    5842        8112 :                         apabyChunkNoDataMask[iBand] = nullptr;
    5843             : 
    5844       16224 :                         poJob->oSrcBufferHolder.reset(
    5845        8112 :                             new PointerHolder(apaChunk[iBand]));
    5846        8112 :                         apaChunk[iBand] = nullptr;
    5847             : 
    5848        8112 :                         poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    5849        8112 :                         jobList.emplace_back(std::move(poJob));
    5850             :                     }
    5851             :                     else
    5852             :                     {
    5853        8204 :                         JobResampleFunc(poJob.get());
    5854        8204 :                         eErr = poJob->eErr;
    5855        8204 :                         if (eErr == CE_None)
    5856             :                         {
    5857        8204 :                             eErr = WriteJobData(poJob.get());
    5858             :                         }
    5859             :                     }
    5860             :                 }
    5861             :             }
    5862             :         }
    5863             : 
    5864             :         // Wait for all pending jobs to complete
    5865         619 :         while (!jobList.empty())
    5866             :         {
    5867          16 :             const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    5868          16 :             if (l_eErr != CE_None && eErr == CE_None)
    5869           0 :                 eErr = l_eErr;
    5870             :         }
    5871             : 
    5872             :         // Flush the data to overviews.
    5873        1797 :         for (int iBand = 0; iBand < nBands; ++iBand)
    5874             :         {
    5875        1194 :             CPLFree(apaChunk[iBand]);
    5876        1194 :             papapoOverviewBands[iBand][iOverview]->FlushCache(false);
    5877             : 
    5878        1194 :             CPLFree(apabyChunkNoDataMask[iBand]);
    5879             :         }
    5880             :     }
    5881             : 
    5882         372 :     CPLFree(pabHasNoData);
    5883         372 :     CPLFree(padfNoDataValue);
    5884             : 
    5885         372 :     if (eErr == CE_None)
    5886         370 :         pfnProgress(1.0, nullptr, pProgressData);
    5887             : 
    5888         372 :     return eErr;
    5889             : }
    5890             : 
    5891             : /************************************************************************/
    5892             : /*            GDALRegenerateOverviewsMultiBand()                        */
    5893             : /************************************************************************/
    5894             : 
    5895             : /**
    5896             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    5897             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
    5898             :  *
    5899             :  * This function will generate one or more overview images from a base
    5900             :  * image using the requested downsampling algorithm.  Its primary use
    5901             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    5902             :  * can also be used to generate downsampled images in one file from another
    5903             :  * outside the overview architecture.
    5904             :  *
    5905             :  * The output bands need to exist in advance and share the same characteristics
    5906             :  * (type, dimensions)
    5907             :  *
    5908             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    5909             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" and "BILINEAR"
    5910             :  *
    5911             :  * It does not support color tables or complex data types.
    5912             :  *
    5913             :  * The pseudo-algorithm used by the function is :
    5914             :  *    for each overview
    5915             :  *       iterate on lines of the source by a step of deltay
    5916             :  *           iterate on columns of the source  by a step of deltax
    5917             :  *               read the source data of size deltax * deltay for all the bands
    5918             :  *               generate the corresponding overview block for all the bands
    5919             :  *
    5920             :  * This function will properly honour NODATA_VALUES tuples (special dataset
    5921             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    5922             :  * considered as the nodata value and not each value of the triplet
    5923             :  * independently per band.
    5924             :  *
    5925             :  * The GDAL_NUM_THREADS configuration option can be set
    5926             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    5927             :  * overview computation.
    5928             :  *
    5929             :  * @param apoSrcBands the list of source bands to downsample (same size as aapoOverviewBands)
    5930             :  * @param aapoOverviewBands two-dimensional array of bands. First dimension is
    5931             :  *                          indexed by bands. Second dimension is indexed by
    5932             :  *                          overview levels. All aapoOverviewBands[i] arrays
    5933             :  *                          must have the same size (i.e. same number of
    5934             :  *                          overviews)
    5935             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    5936             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS" or "BILINEAR").
    5937             :  * @param pfnProgress progress report function.
    5938             :  * @param pProgressData progress function callback data.
    5939             :  * @param papszOptions NULL terminated list of options as
    5940             :  *                     key=value pairs, or NULL
    5941             :  *                     The XOFF, YOFF, XSIZE and YSIZE
    5942             :  *                     options can be specified to express that overviews should
    5943             :  *                     be regenerated only in the specified subset of the source
    5944             :  *                     dataset.
    5945             :  * @return CE_None on success (also when aapoOverviewBands is empty) or CE_Failure on failure.
    5946             :  * @since 3.10
    5947             :  */
    5948             : 
    5949           5 : CPLErr GDALRegenerateOverviewsMultiBand(
    5950             :     const std::vector<GDALRasterBand *> &apoSrcBands,
    5951             :     const std::vector<std::vector<GDALRasterBand *>> &aapoOverviewBands,
    5952             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    5953             :     void *pProgressData, CSLConstList papszOptions)
    5954             : {
    5955           5 :     CPLAssert(apoSrcBands.size() == aapoOverviewBands.size());  // one overview list per source band
    5956          15 :     for (size_t i = 1; i < aapoOverviewBands.size(); ++i)  // compare every band against band 0
    5957             :     {
    5958          10 :         CPLAssert(aapoOverviewBands[i].size() == aapoOverviewBands[0].size());
    5959             :     }
    5960             : 
    5961           5 :     if (aapoOverviewBands.empty())  // no band: nothing to regenerate
    5962           0 :         return CE_None;
    5963             : 
    5964           5 :     std::vector<GDALRasterBand **> apapoOverviewBands;  // per-band C arrays for the pointer-based overload
    5965          20 :     for (auto &apoOverviewBands : aapoOverviewBands)
    5966             :     {
    5967             :         auto papoOverviewBands = static_cast<GDALRasterBand **>(
    5968          15 :             CPLMalloc(apoOverviewBands.size() * sizeof(GDALRasterBand *)));  // released with CPLFree() below
    5969          30 :         for (size_t i = 0; i < apoOverviewBands.size(); ++i)
    5970             :         {
    5971          15 :             papoOverviewBands[i] = apoOverviewBands[i];
    5972             :         }
    5973          15 :         apapoOverviewBands.push_back(papoOverviewBands);
    5974             :     }
    5975          10 :     const CPLErr eErr = GDALRegenerateOverviewsMultiBand(  // delegate to the overload taking counts and arrays
    5976           5 :         static_cast<int>(apoSrcBands.size()), apoSrcBands.data(),
    5977           5 :         static_cast<int>(aapoOverviewBands[0].size()),  // overview count, taken from band 0
    5978           5 :         apapoOverviewBands.data(), pszResampling, pfnProgress, pProgressData,
    5979             :         papszOptions);
    5980          20 :     for (GDALRasterBand **papoOverviewBands : apapoOverviewBands)  // free the temporary arrays
    5981          15 :         CPLFree(papoOverviewBands);
    5982           5 :     return eErr;
    5983             : }
    5984             : 
    5985             : /************************************************************************/
    5986             : /*                        GDALComputeBandStats()                        */
    5987             : /************************************************************************/
    5988             : 
    5989             : /** Undocumented
    5990             :  * @param hSrcBand undocumented.
    5991             :  * @param nSampleStep Step between scanlines used to compute statistics.
    5992             :  *                    When nSampleStep is equal to 1, all scanlines will
    5993             :  *                    be processed.
    5994             :  * @param pdfMean undocumented.
    5995             :  * @param pdfStdDev undocumented.
    5996             :  * @param pfnProgress undocumented.
    5997             :  * @param pProgressData undocumented.
    5998             :  * @return undocumented
    5999             :  */
    6000          16 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
    6001             :                                         int nSampleStep, double *pdfMean,
    6002             :                                         double *pdfStdDev,
    6003             :                                         GDALProgressFunc pfnProgress,
    6004             :                                         void *pProgressData)
    6005             : 
    6006             : {
    6007          16 :     VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
    6008             : 
    6009          16 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    6010             : 
    6011          16 :     if (pfnProgress == nullptr)
    6012          16 :         pfnProgress = GDALDummyProgress;
    6013             : 
    6014          16 :     const int nWidth = poSrcBand->GetXSize();
    6015          16 :     const int nHeight = poSrcBand->GetYSize();
    6016             : 
    6017          16 :     if (nSampleStep >= nHeight || nSampleStep < 1)
    6018           3 :         nSampleStep = 1;
    6019             : 
    6020          16 :     GDALDataType eWrkType = GDT_Unknown;
    6021          16 :     float *pafData = nullptr;
    6022          16 :     GDALDataType eType = poSrcBand->GetRasterDataType();
    6023          16 :     const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    6024          16 :     if (bComplex)
    6025             :     {
    6026             :         pafData = static_cast<float *>(
    6027           0 :             VSI_MALLOC_VERBOSE(nWidth * 2 * sizeof(float)));
    6028           0 :         eWrkType = GDT_CFloat32;
    6029             :     }
    6030             :     else
    6031             :     {
    6032             :         pafData =
    6033          16 :             static_cast<float *>(VSI_MALLOC_VERBOSE(nWidth * sizeof(float)));
    6034          16 :         eWrkType = GDT_Float32;
    6035             :     }
    6036             : 
    6037          16 :     if (nWidth == 0 || pafData == nullptr)
    6038             :     {
    6039           0 :         VSIFree(pafData);
    6040           0 :         return CE_Failure;
    6041             :     }
    6042             : 
    6043             :     /* -------------------------------------------------------------------- */
    6044             :     /*      Loop over all sample lines.                                     */
    6045             :     /* -------------------------------------------------------------------- */
    6046          16 :     double dfSum = 0.0;
    6047          16 :     double dfSum2 = 0.0;
    6048          16 :     int iLine = 0;
    6049          16 :     GIntBig nSamples = 0;
    6050             : 
    6051        2143 :     do
    6052             :     {
    6053        2159 :         if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    6054             :                          pProgressData))
    6055             :         {
    6056           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6057           0 :             CPLFree(pafData);
    6058           0 :             return CE_Failure;
    6059             :         }
    6060             : 
    6061             :         const CPLErr eErr =
    6062        2159 :             poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
    6063             :                                 1, eWrkType, 0, 0, nullptr);
    6064        2159 :         if (eErr != CE_None)
    6065             :         {
    6066           1 :             CPLFree(pafData);
    6067           1 :             return eErr;
    6068             :         }
    6069             : 
    6070      725204 :         for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    6071             :         {
    6072      723046 :             float fValue = 0.0f;
    6073             : 
    6074      723046 :             if (bComplex)
    6075             :             {
    6076             :                 // Compute the magnitude of the complex value.
    6077             :                 fValue =
    6078           0 :                     std::hypot(pafData[iPixel * 2], pafData[iPixel * 2 + 1]);
    6079             :             }
    6080             :             else
    6081             :             {
    6082      723046 :                 fValue = pafData[iPixel];
    6083             :             }
    6084             : 
    6085      723046 :             dfSum += fValue;
    6086      723046 :             dfSum2 += static_cast<double>(fValue) * fValue;
    6087             :         }
    6088             : 
    6089        2158 :         nSamples += nWidth;
    6090        2158 :         iLine += nSampleStep;
    6091        2158 :     } while (iLine < nHeight);
    6092             : 
    6093          15 :     if (!pfnProgress(1.0, nullptr, pProgressData))
    6094             :     {
    6095           0 :         CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6096           0 :         CPLFree(pafData);
    6097           0 :         return CE_Failure;
    6098             :     }
    6099             : 
    6100             :     /* -------------------------------------------------------------------- */
    6101             :     /*      Produce the result values.                                      */
    6102             :     /* -------------------------------------------------------------------- */
    6103          15 :     if (pdfMean != nullptr)
    6104          15 :         *pdfMean = dfSum / nSamples;
    6105             : 
    6106          15 :     if (pdfStdDev != nullptr)
    6107             :     {
    6108          15 :         const double dfMean = dfSum / nSamples;
    6109             : 
    6110          15 :         *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
    6111             :     }
    6112             : 
    6113          15 :     CPLFree(pafData);
    6114             : 
    6115          15 :     return CE_None;
    6116             : }
    6117             : 
    6118             : /************************************************************************/
    6119             : /*                  GDALOverviewMagnitudeCorrection()                   */
    6120             : /*                                                                      */
    6121             : /*      Correct the mean and standard deviation of the overviews of     */
    6122             : /*      the given band to match the base layer approximately.           */
    6123             : /************************************************************************/
    6124             : 
    6125             : /** Undocumented
    6126             :  * @param hBaseBand undocumented.
    6127             :  * @param nOverviewCount undocumented.
    6128             :  * @param pahOverviews undocumented.
    6129             :  * @param pfnProgress undocumented.
    6130             :  * @param pProgressData undocumented.
    6131             :  * @return undocumented
    6132             :  */
    6133           0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
    6134             :                                        int nOverviewCount,
    6135             :                                        GDALRasterBandH *pahOverviews,
    6136             :                                        GDALProgressFunc pfnProgress,
    6137             :                                        void *pProgressData)
    6138             : 
    6139             : {
    6140           0 :     VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
    6141             : 
    6142             :     /* -------------------------------------------------------------------- */
    6143             :     /*      Compute mean/stddev for source raster.                          */
    6144             :     /* -------------------------------------------------------------------- */
    6145           0 :     double dfOrigMean = 0.0;
    6146           0 :     double dfOrigStdDev = 0.0;
    6147             :     {
    6148             :         const CPLErr eErr =
    6149           0 :             GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
    6150             :                                  pfnProgress, pProgressData);
    6151             : 
    6152           0 :         if (eErr != CE_None)
    6153           0 :             return eErr;
    6154             :     }
    6155             : 
    6156             :     /* -------------------------------------------------------------------- */
    6157             :     /*      Loop on overview bands.                                         */
    6158             :     /* -------------------------------------------------------------------- */
    6159           0 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    6160             :     {
    6161             :         GDALRasterBand *poOverview =
    6162           0 :             GDALRasterBand::FromHandle(pahOverviews[iOverview]);
    6163             :         double dfOverviewMean, dfOverviewStdDev;
    6164             : 
    6165             :         const CPLErr eErr =
    6166           0 :             GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
    6167             :                                  &dfOverviewStdDev, pfnProgress, pProgressData);
    6168             : 
    6169           0 :         if (eErr != CE_None)
    6170           0 :             return eErr;
    6171             : 
    6172           0 :         double dfGain = 1.0;
    6173           0 :         if (dfOrigStdDev >= 0.0001)
    6174           0 :             dfGain = dfOrigStdDev / dfOverviewStdDev;
    6175             : 
    6176             :         /* --------------------------------------------------------------------
    6177             :          */
    6178             :         /*      Apply gain and offset. */
    6179             :         /* --------------------------------------------------------------------
    6180             :          */
    6181           0 :         const int nWidth = poOverview->GetXSize();
    6182           0 :         const int nHeight = poOverview->GetYSize();
    6183             : 
    6184           0 :         GDALDataType eWrkType = GDT_Unknown;
    6185           0 :         float *pafData = nullptr;
    6186           0 :         const GDALDataType eType = poOverview->GetRasterDataType();
    6187           0 :         const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    6188           0 :         if (bComplex)
    6189             :         {
    6190             :             pafData = static_cast<float *>(
    6191           0 :                 VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
    6192           0 :             eWrkType = GDT_CFloat32;
    6193             :         }
    6194             :         else
    6195             :         {
    6196             :             pafData = static_cast<float *>(
    6197           0 :                 VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
    6198           0 :             eWrkType = GDT_Float32;
    6199             :         }
    6200             : 
    6201           0 :         if (pafData == nullptr)
    6202             :         {
    6203           0 :             return CE_Failure;
    6204             :         }
    6205             : 
    6206           0 :         for (int iLine = 0; iLine < nHeight; ++iLine)
    6207             :         {
    6208           0 :             if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    6209             :                              pProgressData))
    6210             :             {
    6211           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6212           0 :                 CPLFree(pafData);
    6213           0 :                 return CE_Failure;
    6214             :             }
    6215             : 
    6216           0 :             if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
    6217             :                                      nWidth, 1, eWrkType, 0, 0,
    6218           0 :                                      nullptr) != CE_None)
    6219             :             {
    6220           0 :                 CPLFree(pafData);
    6221           0 :                 return CE_Failure;
    6222             :             }
    6223             : 
    6224           0 :             for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    6225             :             {
    6226           0 :                 if (bComplex)
    6227             :                 {
    6228           0 :                     pafData[iPixel * 2] *= static_cast<float>(dfGain);
    6229           0 :                     pafData[iPixel * 2 + 1] *= static_cast<float>(dfGain);
    6230             :                 }
    6231             :                 else
    6232             :                 {
    6233           0 :                     pafData[iPixel] = static_cast<float>(
    6234           0 :                         (pafData[iPixel] - dfOverviewMean) * dfGain +
    6235             :                         dfOrigMean);
    6236             :                 }
    6237             :             }
    6238             : 
    6239           0 :             if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
    6240             :                                      nWidth, 1, eWrkType, 0, 0,
    6241           0 :                                      nullptr) != CE_None)
    6242             :             {
    6243           0 :                 CPLFree(pafData);
    6244           0 :                 return CE_Failure;
    6245             :             }
    6246             :         }
    6247             : 
    6248           0 :         if (!pfnProgress(1.0, nullptr, pProgressData))
    6249             :         {
    6250           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    6251           0 :             CPLFree(pafData);
    6252           0 :             return CE_Failure;
    6253             :         }
    6254             : 
    6255           0 :         CPLFree(pafData);
    6256             :     }
    6257             : 
    6258           0 :     return CE_None;
    6259             : }

Generated by: LCOV version 1.14