LCOV - code coverage report
Current view: top level - gcore - overview.cpp (source / functions) Hit Total Coverage
Test: gdal_filtered.info Lines: 2172 2522 86.1 %
Date: 2024-05-03 15:49:35 Functions: 95 119 79.8 %

          Line data    Source code
       1             : 
       2             : /******************************************************************************
       3             :  *
       4             :  * Project:  GDAL Core
       5             :  * Purpose:  Helper code to implement overview support in different drivers.
       6             :  * Author:   Frank Warmerdam, warmerdam@pobox.com
       7             :  *
       8             :  ******************************************************************************
       9             :  * Copyright (c) 2000, Frank Warmerdam
      10             :  * Copyright (c) 2007-2010, Even Rouault <even dot rouault at spatialys.com>
      11             :  *
      12             :  * Permission is hereby granted, free of charge, to any person obtaining a
      13             :  * copy of this software and associated documentation files (the "Software"),
      14             :  * to deal in the Software without restriction, including without limitation
      15             :  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      16             :  * and/or sell copies of the Software, and to permit persons to whom the
      17             :  * Software is furnished to do so, subject to the following conditions:
      18             :  *
      19             :  * The above copyright notice and this permission notice shall be included
      20             :  * in all copies or substantial portions of the Software.
      21             :  *
      22             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
      23             :  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      24             :  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
      25             :  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      26             :  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      27             :  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
      28             :  * DEALINGS IN THE SOFTWARE.
      29             :  ****************************************************************************/
      30             : 
      31             : #include "cpl_port.h"
      32             : #include "gdal_priv.h"
      33             : 
      34             : #include <cmath>
      35             : #include <cstddef>
      36             : #include <cstdlib>
      37             : 
      38             : #include <algorithm>
      39             : #include <complex>
      40             : #include <condition_variable>
      41             : #include <limits>
      42             : #include <list>
      43             : #include <memory>
      44             : #include <mutex>
      45             : #include <vector>
      46             : 
      47             : #include "cpl_conv.h"
      48             : #include "cpl_error.h"
      49             : #include "cpl_progress.h"
      50             : #include "cpl_vsi.h"
      51             : #include "gdal.h"
      52             : #include "gdal_thread_pool.h"
      53             : #include "gdalwarper.h"
      54             : 
      55             : // Enable the SSE2 code paths only on 64-bit x86 processors, which are
      56             : // guaranteed to have SSE2, or when __AVX2__ is defined.
      57             : #if defined(__x86_64) || defined(_M_X64) || defined(__AVX2__)
      58             : #define USE_SSE2
      59             : 
      60             : #include "gdalsse_priv.h"
      61             : 
      62             : #ifdef __SSE3__
      63             : #include <pmmintrin.h>
      64             : #endif
      65             : #ifdef __SSSE3__
      66             : #include <tmmintrin.h>
      67             : #endif
      68             : #ifdef __SSE4_1__
      69             : #include <smmintrin.h>
      70             : #endif
      71             : #ifdef __AVX2__
      72             : #include <immintrin.h>
      73             : #endif
      74             : 
      75             : #endif
      76             : 
      77             : // To be included after above USE_SSE2 and include gdalsse_priv.h
      78             : // to avoid build issue on Windows x86
      79             : #include "gdal_priv_templates.hpp"
      80             : 
      81             : /************************************************************************/
      82             : /*                      GDALResampleChunk_Near()                        */
      83             : /************************************************************************/
      84             : 
      85             : template <class T>
      86        1062 : static CPLErr GDALResampleChunk_NearT(
      87             :     double dfXRatioDstToSrc, double dfYRatioDstToSrc, GDALDataType eWrkDataType,
      88             :     const T *pChunk, int nChunkXOff, int nChunkXSize, int nChunkYOff,
      89             :     int nDstXOff, int nDstXOff2, int nDstYOff, int nDstYOff2, T **ppDstBuffer)
      90             : 
      91             : {
      92        1062 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
      93             : 
      94             :     /* -------------------------------------------------------------------- */
      95             :     /*      Allocate buffers.                                               */
      96             :     /* -------------------------------------------------------------------- */
      97        1062 :     *ppDstBuffer = static_cast<T *>(
      98        1062 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
      99             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
     100        1062 :     if (*ppDstBuffer == nullptr)
     101             :     {
     102           0 :         return CE_Failure;
     103             :     }
     104        1062 :     T *const pDstBuffer = *ppDstBuffer;
     105             : 
     106             :     int *panSrcXOff =
     107        1062 :         static_cast<int *>(VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(int)));
     108             : 
     109        1062 :     if (panSrcXOff == nullptr)
     110             :     {
     111           0 :         VSIFree(panSrcXOff);
     112           0 :         return CE_Failure;
     113             :     }
     114             : 
     115             :     /* ==================================================================== */
     116             :     /*      Precompute inner loop constants.                                */
     117             :     /* ==================================================================== */
     118      500226 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
     119             :     {
     120      499164 :         int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
     121      499164 :         if (nSrcXOff < nChunkXOff)
     122           0 :             nSrcXOff = nChunkXOff;
     123             : 
     124      499164 :         panSrcXOff[iDstPixel - nDstXOff] = nSrcXOff;
     125             :     }
     126             : 
     127             :     /* ==================================================================== */
     128             :     /*      Loop over destination scanlines.                                */
     129             :     /* ==================================================================== */
     130      136491 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
     131             :     {
     132      135429 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
     133      135429 :         if (nSrcYOff < nChunkYOff)
     134           0 :             nSrcYOff = nChunkYOff;
     135             : 
     136      135429 :         const T *const pSrcScanline =
     137             :             pChunk +
     138      135429 :             (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize) -
     139      133063 :             nChunkXOff;
     140             : 
     141             :         /* --------------------------------------------------------------------
     142             :          */
     143             :         /*      Loop over destination pixels */
     144             :         /* --------------------------------------------------------------------
     145             :          */
     146      135429 :         T *pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
     147   116177106 :         for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
     148             :         {
     149   116041740 :             pDstScanline[iDstPixel] = pSrcScanline[panSrcXOff[iDstPixel]];
     150             :         }
     151             :     }
     152             : 
     153        1062 :     CPLFree(panSrcXOff);
     154             : 
     155        1062 :     return CE_None;
     156             : }
     157             : 
     158        1062 : static CPLErr GDALResampleChunk_Near(
     159             :     double dfXRatioDstToSrc, double dfYRatioDstToSrc, double /* dfSrcXDelta */,
     160             :     double /* dfSrcYDelta */, GDALDataType eWrkDataType, const void *pChunk,
     161             :     const GByte * /* pabyChunkNodataMask_unused */, int nChunkXOff,
     162             :     int nChunkXSize, int nChunkYOff, int /* nChunkYSize */, int nDstXOff,
     163             :     int nDstXOff2, int nDstYOff, int nDstYOff2, GDALRasterBand * /*poOverview*/,
     164             :     void **ppDstBuffer, GDALDataType *peDstBufferDataType,
     165             :     const char * /* pszResampling_unused */, bool /* bHasNoData_unused */,
     166             :     double /* fNoDataValue_unused */,
     167             :     GDALColorTable * /* poColorTable_unused */, GDALDataType /* eSrcDataType */,
     168             :     bool /* bPropagateNoData */)
     169             : {
     170        1062 :     *peDstBufferDataType = eWrkDataType;
     171        1062 :     if (eWrkDataType == GDT_Byte)
     172         966 :         return GDALResampleChunk_NearT(
     173             :             dfXRatioDstToSrc, dfYRatioDstToSrc, eWrkDataType,
     174             :             static_cast<const GByte *>(pChunk), nChunkXOff, nChunkXSize,
     175             :             nChunkYOff, nDstXOff, nDstXOff2, nDstYOff, nDstYOff2,
     176         966 :             reinterpret_cast<GByte **>(ppDstBuffer));
     177          96 :     else if (eWrkDataType == GDT_UInt16)
     178           5 :         return GDALResampleChunk_NearT(
     179             :             dfXRatioDstToSrc, dfYRatioDstToSrc, eWrkDataType,
     180             :             static_cast<const GInt16 *>(pChunk), nChunkXOff, nChunkXSize,
     181             :             nChunkYOff, nDstXOff, nDstXOff2, nDstYOff, nDstYOff2,
     182           5 :             reinterpret_cast<GInt16 **>(ppDstBuffer));
     183          91 :     else if (eWrkDataType == GDT_Float32)
     184          64 :         return GDALResampleChunk_NearT(
     185             :             dfXRatioDstToSrc, dfYRatioDstToSrc, eWrkDataType,
     186             :             static_cast<const float *>(pChunk), nChunkXOff, nChunkXSize,
     187             :             nChunkYOff, nDstXOff, nDstXOff2, nDstYOff, nDstYOff2,
     188          64 :             reinterpret_cast<float **>(ppDstBuffer));
     189          27 :     else if (eWrkDataType == GDT_Float64)
     190          27 :         return GDALResampleChunk_NearT(
     191             :             dfXRatioDstToSrc, dfYRatioDstToSrc, eWrkDataType,
     192             :             static_cast<const double *>(pChunk), nChunkXOff, nChunkXSize,
     193             :             nChunkYOff, nDstXOff, nDstXOff2, nDstYOff, nDstYOff2,
     194          27 :             reinterpret_cast<double **>(ppDstBuffer));
     195             : 
     196           0 :     CPLAssert(false);
     197             :     return CE_Failure;
     198             : }
     199             : 
     200             : namespace
     201             : {
     202             : 
     203             : // Find in the color table the entry whose RGB value is the closest
     204             : // (using quadratic distance) to the test color, ignoring transparent entries.
     205        3837 : int BestColorEntry(const std::vector<GDALColorEntry> &entries,
     206             :                    const GDALColorEntry &test)
     207             : {
     208        3837 :     int nMinDist = std::numeric_limits<int>::max();
     209        3837 :     size_t bestEntry = 0;
     210      986109 :     for (size_t i = 0; i < entries.size(); ++i)
     211             :     {
     212      982272 :         const GDALColorEntry &entry = entries[i];
     213             :         // Ignore transparent entries
     214      982272 :         if (entry.c4 == 0)
     215        3237 :             continue;
     216             : 
     217      979035 :         int nDist = ((test.c1 - entry.c1) * (test.c1 - entry.c1)) +
     218      979035 :                     ((test.c2 - entry.c2) * (test.c2 - entry.c2)) +
     219      979035 :                     ((test.c3 - entry.c3) * (test.c3 - entry.c3));
     220      979035 :         if (nDist < nMinDist)
     221             :         {
     222       15847 :             nMinDist = nDist;
     223       15847 :             bestEntry = i;
     224             :         }
     225             :     }
     226        3837 :     return static_cast<int>(bestEntry);
     227             : }
     228             : 
     229           7 : std::vector<GDALColorEntry> ReadColorTable(const GDALColorTable &table,
     230             :                                            int &transparentIdx)
     231             : {
     232           7 :     std::vector<GDALColorEntry> entries(table.GetColorEntryCount());
     233             : 
     234           7 :     transparentIdx = -1;
     235           7 :     int i = 0;
     236        1799 :     for (auto &entry : entries)
     237             :     {
     238        1792 :         table.GetColorEntryAsRGB(i, &entry);
     239        1792 :         if (transparentIdx < 0 && entry.c4 == 0)
     240           1 :             transparentIdx = i;
     241        1792 :         ++i;
     242             :     }
     243           7 :     return entries;
     244             : }
     245             : 
     246             : }  // unnamed  namespace
     247             : 
     248             : /************************************************************************/
     249             : /*                             SQUARE()                                 */
     250             : /************************************************************************/
     251             : 
     252        3721 : template <class T, class Tsquare = T> inline Tsquare SQUARE(T val)
     253             : {
     254        3721 :     return static_cast<Tsquare>(val) * val;
     255             : }
     256             : 
     257             : /************************************************************************/
     258             : /*                          ComputeIntegerRMS()                         */
     259             : /************************************************************************/
     260             : // Compute rms = sqrt(sumSquares / weight) in such a way that it is the
     261             : // integer that minimizes abs(rms**2 - sumSquares / weight)
     262             : template <class T, class Twork>
     263          42 : inline T ComputeIntegerRMS(double sumSquares, double weight)
     264             : {
     265          42 :     const double sumDivWeight = sumSquares / weight;
     266          42 :     T rms = static_cast<T>(sqrt(sumDivWeight));
     267             : 
     268             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     269             :     // Naive version:
     270             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     271          42 :     if (static_cast<double>(static_cast<Twork>(2) * rms * (rms + 1) + 1) <
     272          42 :         2 * sumDivWeight)
     273           6 :         rms += 1;
     274          42 :     return rms;
     275             : }
     276             : 
     277           0 : template <class T, class Tsum> inline T ComputeIntegerRMS_4values(Tsum)
     278             : {
     279           0 :     CPLAssert(false);
     280             :     return 0;
     281             : }
     282             : 
     283          24 : template <> inline GByte ComputeIntegerRMS_4values<GByte, int>(int sumSquares)
     284             : {
     285             :     // It has been verified that given the correction on rms below, using
     286             :     // sqrt((float)((sumSquares + 1)/ 4)) or sqrt((float)sumSquares * 0.25f)
     287             :     // is equivalent, so use the former as it is used twice.
     288          24 :     const int sumSquaresPlusOneDiv4 = (sumSquares + 1) / 4;
     289          24 :     const float sumDivWeight = static_cast<float>(sumSquaresPlusOneDiv4);
     290          24 :     GByte rms = static_cast<GByte>(std::sqrt(sumDivWeight));
     291             : 
     292             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     293             :     // Naive version:
     294             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     295             :     // Optimized version for integer case and weight == 4
     296          24 :     if (static_cast<int>(rms) * (rms + 1) < sumSquaresPlusOneDiv4)
     297           5 :         rms += 1;
     298          24 :     return rms;
     299             : }
     300             : 
     301             : template <>
     302          20 : inline GUInt16 ComputeIntegerRMS_4values<GUInt16, double>(double sumSquares)
     303             : {
     304          20 :     const double sumDivWeight = sumSquares * 0.25;
     305          20 :     GUInt16 rms = static_cast<GUInt16>(std::sqrt(sumDivWeight));
     306             : 
     307             :     // Is rms**2 or (rms+1)**2 closest to sumSquares / weight ?
     308             :     // Naive version:
     309             :     // if( weight * (rms+1)**2 - sumSquares < sumSquares - weight * rms**2 )
     310             :     // Optimized version for integer case and weight == 4
     311          20 :     if (static_cast<GUInt32>(rms) * (rms + 1) <
     312          20 :         static_cast<GUInt32>(sumDivWeight + 0.25))
     313           4 :         rms += 1;
     314          20 :     return rms;
     315             : }
     316             : 
     317             : #ifdef USE_SSE2
     318             : 
     319             : /************************************************************************/
     320             : /*                   QuadraticMeanByteSSE2OrAVX2()                      */
     321             : /************************************************************************/
     322             : 
     323             : #ifdef __SSE4_1__
     324             : #define sse2_packus_epi32 _mm_packus_epi32
     325             : #else
     326      516119 : inline __m128i sse2_packus_epi32(__m128i a, __m128i b)
     327             : {
     328      516119 :     const auto minus32768_32 = _mm_set1_epi32(-32768);
     329      516119 :     const auto minus32768_16 = _mm_set1_epi16(-32768);
     330      516119 :     a = _mm_add_epi32(a, minus32768_32);
     331      516119 :     b = _mm_add_epi32(b, minus32768_32);
     332      516119 :     a = _mm_packs_epi32(a, b);
     333      516119 :     a = _mm_sub_epi16(a, minus32768_16);
     334      516119 :     return a;
     335             : }
     336             : #endif
     337             : 
     338             : #ifdef __SSSE3__
     339             : #define sse2_hadd_epi16 _mm_hadd_epi16
     340             : #else
     341     4660650 : inline __m128i sse2_hadd_epi16(__m128i a, __m128i b)
     342             : {
     343             :     // Horizontal addition of adjacent pairs
     344     4660650 :     const auto mask = _mm_set1_epi32(0xFFFF);
     345             :     const auto horizLo =
     346    13982000 :         _mm_add_epi32(_mm_and_si128(a, mask), _mm_srli_epi32(a, 16));
     347             :     const auto horizHi =
     348    13982000 :         _mm_add_epi32(_mm_and_si128(b, mask), _mm_srli_epi32(b, 16));
     349             : 
     350             :     // Recombine low and high parts
     351     4660650 :     return _mm_packs_epi32(horizLo, horizHi);
     352             : }
     353             : #endif
     354             : 
     355             : #ifdef __AVX2__
     356             : 
     357             : #define DEST_ELTS 16
     358             : #define set1_epi16 _mm256_set1_epi16
     359             : #define set1_epi32 _mm256_set1_epi32
     360             : #define setzero _mm256_setzero_si256
     361             : #define set1_ps _mm256_set1_ps
     362             : #define loadu_int(x) _mm256_loadu_si256(reinterpret_cast<__m256i const *>(x))
     363             : #define unpacklo_epi8 _mm256_unpacklo_epi8
     364             : #define unpackhi_epi8 _mm256_unpackhi_epi8
     365             : #define madd_epi16 _mm256_madd_epi16
     366             : #define add_epi32 _mm256_add_epi32
     367             : #define mul_ps _mm256_mul_ps
     368             : #define cvtepi32_ps _mm256_cvtepi32_ps
     369             : #define sqrt_ps _mm256_sqrt_ps
     370             : #define cvttps_epi32 _mm256_cvttps_epi32
     371             : #define packs_epi32 _mm256_packs_epi32
     372             : #define packus_epi32 _mm256_packus_epi32
     373             : #define srli_epi32 _mm256_srli_epi32
     374             : #define mullo_epi16 _mm256_mullo_epi16
     375             : #define srli_epi16 _mm256_srli_epi16
     376             : #define cmpgt_epi16 _mm256_cmpgt_epi16
     377             : #define add_epi16 _mm256_add_epi16
     378             : #define sub_epi16 _mm256_sub_epi16
     379             : #define packus_epi16 _mm256_packus_epi16
     380             : /* AVX2 operates on 2 separate 128-bit lanes, so we have to do shuffling */
     381             : /* to get the lower 128 bits of what would be a true 256-bit vector register
     382             :  */
     383             : #define store_lo(x, y)                                                         \
     384             :     _mm_storeu_si128(reinterpret_cast<__m128i *>(x),                           \
     385             :                      _mm256_extracti128_si256(                                 \
     386             :                          _mm256_permute4x64_epi64((y), 0 | (2 << 2)), 0))
     387             : #define hadd_epi16 _mm256_hadd_epi16
     388             : #define zeroupper() _mm256_zeroupper()
     389             : #else
     390             : #define DEST_ELTS 8
     391             : #define set1_epi16 _mm_set1_epi16
     392             : #define set1_epi32 _mm_set1_epi32
     393             : #define setzero _mm_setzero_si128
     394             : #define set1_ps _mm_set1_ps
     395             : #define loadu_int(x) _mm_loadu_si128(reinterpret_cast<__m128i const *>(x))
     396             : #define unpacklo_epi8 _mm_unpacklo_epi8
     397             : #define unpackhi_epi8 _mm_unpackhi_epi8
     398             : #define madd_epi16 _mm_madd_epi16
     399             : #define add_epi32 _mm_add_epi32
     400             : #define mul_ps _mm_mul_ps
     401             : #define cvtepi32_ps _mm_cvtepi32_ps
     402             : #define sqrt_ps _mm_sqrt_ps
     403             : #define cvttps_epi32 _mm_cvttps_epi32
     404             : #define packs_epi32 _mm_packs_epi32
     405             : #define packus_epi32 sse2_packus_epi32
     406             : #define srli_epi32 _mm_srli_epi32
     407             : #define mullo_epi16 _mm_mullo_epi16
     408             : #define srli_epi16 _mm_srli_epi16
     409             : #define cmpgt_epi16 _mm_cmpgt_epi16
     410             : #define add_epi16 _mm_add_epi16
     411             : #define sub_epi16 _mm_sub_epi16
     412             : #define packus_epi16 _mm_packus_epi16
     413             : #define store_lo(x, y) _mm_storel_epi64(reinterpret_cast<__m128i *>(x), (y))
     414             : #define hadd_epi16 sse2_hadd_epi16
     415             : #define zeroupper() (void)0
     416             : #endif
     417             : 
     418             : #if defined(__GNUC__) && defined(__AVX2__)
     419             : // Disabling inlining works around a bug with gcc 9.3 (Ubuntu 20.04) in
     420             : // -O2 -mavx2 mode in QuadraticMeanFloatSSE2(),
     421             : // where the register that contains minus_zero is correctly
     422             : // loaded the first time the function is called (looking at the disassembly,
     423             : // one sees it is loaded much earlier than the function), but gets corrupted
     424             : // (zeroed) in following iterations.
     425             : // It appears the bug is due to the explicit zeroupper() call at the end of
     426             : // the function.
     427             : // The bug is at least solved in gcc 10.2.
     428             : // Inlining doesn't bring much here to performance.
     429             : // This is also needed with gcc 9.3 on QuadraticMeanByteSSE2OrAVX2() in
     430             : // -O3 -mavx2 mode
     431             : #define NOINLINE __attribute__((noinline))
     432             : #else
     433             : #define NOINLINE
     434             : #endif
     435             : 
     436             : template <class T>
     437             : static int NOINLINE
     438        5385 : QuadraticMeanByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
     439             :                             const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     440             :                             T *CPL_RESTRICT pDstScanline)
     441             : {
     442             :     // Optimized implementation for RMS on Byte by
     443             :     // processing by group of 8 output pixels, so as to use
     444             :     // a single _mm_sqrt_ps() call for 4 output pixels
     445        5385 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     446             : 
     447        5385 :     int iDstPixel = 0;
     448        5385 :     const auto one16 = set1_epi16(1);
     449        5385 :     const auto one32 = set1_epi32(1);
     450        5385 :     const auto zero = setzero();
     451        5385 :     const auto minus32768 = set1_epi16(-32768);
     452             : 
     453      521496 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     454             :     {
     455             :         // Load 2 * DEST_ELTS bytes from each line
     456      516111 :         auto firstLine = loadu_int(pSrcScanlineShifted);
     457     1032220 :         auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
     458             :         // Extend those Bytes as UInt16s
     459      516111 :         auto firstLineLo = unpacklo_epi8(firstLine, zero);
     460      516111 :         auto firstLineHi = unpackhi_epi8(firstLine, zero);
     461      516111 :         auto secondLineLo = unpacklo_epi8(secondLine, zero);
     462      516111 :         auto secondLineHi = unpackhi_epi8(secondLine, zero);
     463             : 
     464             :         // Multiplication of 16 bit values and horizontal
     465             :         // addition of 32 bit results
     466             :         // [ src[2*i+0]^2 + src[2*i+1]^2 for i in range(4) ]
     467      516111 :         firstLineLo = madd_epi16(firstLineLo, firstLineLo);
     468      516111 :         firstLineHi = madd_epi16(firstLineHi, firstLineHi);
     469      516111 :         secondLineLo = madd_epi16(secondLineLo, secondLineLo);
     470      516111 :         secondLineHi = madd_epi16(secondLineHi, secondLineHi);
     471             : 
     472             :         // Vertical addition
     473      516111 :         const auto sumSquaresLo = add_epi32(firstLineLo, secondLineLo);
     474      516111 :         const auto sumSquaresHi = add_epi32(firstLineHi, secondLineHi);
     475             : 
     476             :         const auto sumSquaresPlusOneDiv4Lo =
     477     1032220 :             srli_epi32(add_epi32(sumSquaresLo, one32), 2);
     478             :         const auto sumSquaresPlusOneDiv4Hi =
     479     1032220 :             srli_epi32(add_epi32(sumSquaresHi, one32), 2);
     480             : 
     481             :         // Take square root and truncate/floor to int32
     482             :         const auto rmsLo =
     483     1548330 :             cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Lo)));
     484             :         const auto rmsHi =
     485     1548330 :             cvttps_epi32(sqrt_ps(cvtepi32_ps(sumSquaresPlusOneDiv4Hi)));
     486             : 
     487             :         // Merge back low and high registers with each RMS value
     488             :         // as a 16 bit value.
     489      516111 :         auto rms = packs_epi32(rmsLo, rmsHi);
     490             : 
     491             :         // Round to upper value if it minimizes the
     492             :         // error |rms^2 - sumSquares/4|
     493             :         // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     494             :         //    rms += 1;
     495             :         // which is equivalent to:
     496             :         // if( rms * (rms + 1) < (sumSquares+1) / 4 )
     497             :         //    rms += 1;
     498             :         // And both left and right parts fit on 16 (unsigned) bits
     499             :         const auto sumSquaresPlusOneDiv4 =
     500      516111 :             packus_epi32(sumSquaresPlusOneDiv4Lo, sumSquaresPlusOneDiv4Hi);
     501             :         // cmpgt_epi16 operates on signed int16, but here
     502             :         // we have unsigned values, so shift them by -32768 before
     503     2580560 :         auto mask = cmpgt_epi16(
     504             :             add_epi16(sumSquaresPlusOneDiv4, minus32768),
     505             :             add_epi16(mullo_epi16(rms, add_epi16(rms, one16)), minus32768));
     506             :         // The value of the mask will be -1 when the correction needs to be
     507             :         // applied
     508      516111 :         rms = sub_epi16(rms, mask);
     509             : 
     510             :         // Pack each 16 bit RMS value to 8 bits
     511      516111 :         rms = packus_epi16(rms, rms /* could be anything */);
     512      516111 :         store_lo(&pDstScanline[iDstPixel], rms);
     513      516111 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     514             :     }
     515             :     zeroupper();
     516             : 
     517        5385 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     518        5385 :     return iDstPixel;
     519             : }
     520             : 
     521             : /************************************************************************/
     522             : /*                      AverageByteSSE2OrAVX2()                         */
     523             : /************************************************************************/
     524             : 
     525             : template <class T>
     526             : static int
     527      110946 : AverageByteSSE2OrAVX2(int nDstXWidth, int nChunkXSize,
     528             :                       const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     529             :                       T *CPL_RESTRICT pDstScanline)
     530             : {
     531             :     // Optimized implementation for average on Byte by
     532             :     // processing by group of 8 output pixels.
     533             : 
     534      110946 :     const auto zero = setzero();
     535      110946 :     const auto two16 = set1_epi16(2);
     536      110946 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     537             : 
     538      110946 :     int iDstPixel = 0;
     539     4771600 :     for (; iDstPixel < nDstXWidth - (DEST_ELTS - 1); iDstPixel += DEST_ELTS)
     540             :     {
     541             :         // Load 2 * DEST_ELTS bytes from each line
     542     4660650 :         const auto firstLine = loadu_int(pSrcScanlineShifted);
     543     9321310 :         const auto secondLine = loadu_int(pSrcScanlineShifted + nChunkXSize);
     544             :         // Extend those Bytes as UInt16s
     545     4660650 :         const auto firstLineLo = unpacklo_epi8(firstLine, zero);
     546     4660650 :         const auto firstLineHi = unpackhi_epi8(firstLine, zero);
     547     4660650 :         const auto secondLineLo = unpacklo_epi8(secondLine, zero);
     548     4660650 :         const auto secondLineHi = unpackhi_epi8(secondLine, zero);
     549             : 
     550             :         // Vertical addition
     551     4660650 :         const auto sumLo = add_epi16(firstLineLo, secondLineLo);
     552     4660650 :         const auto sumHi = add_epi16(firstLineHi, secondLineHi);
     553             : 
     554             :         // Horizontal addition of adjacent pairs, and recombine low and high
     555             :         // parts
     556     4660650 :         const auto sum = hadd_epi16(sumLo, sumHi);
     557             : 
     558             :         // average = (sum + 2) / 4
     559     9321310 :         auto average = srli_epi16(add_epi16(sum, two16), 2);
     560             : 
     561             :         // Pack each 16 bit average value to 8 bits
     562     4660650 :         average = packus_epi16(average, average /* could be anything */);
     563     4660650 :         store_lo(&pDstScanline[iDstPixel], average);
     564     4660650 :         pSrcScanlineShifted += 2 * DEST_ELTS;
     565             :     }
     566             :     zeroupper();
     567             : 
     568      110946 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     569      110946 :     return iDstPixel;
     570             : }
     571             : 
     572             : /************************************************************************/
     573             : /*                     QuadraticMeanUInt16SSE2()                        */
     574             : /************************************************************************/
     575             : 
     576             : #ifdef __SSE3__
     577             : #define sse2_hadd_pd _mm_hadd_pd
     578             : #else
     579           8 : inline __m128d sse2_hadd_pd(__m128d a, __m128d b)
     580             : {
     581             :     auto aLo_bLo =
     582          32 :         _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(a), _mm_castpd_ps(b)));
     583             :     auto aHi_bHi =
     584          32 :         _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(b), _mm_castpd_ps(a)));
     585           8 :     return _mm_add_pd(aLo_bLo, aHi_bHi);  // (aLo + aHi, bLo + bHi)
     586             : }
     587             : #endif
     588             : 
     589          40 : inline __m128d SQUARE(__m128d x)
     590             : {
     591          40 :     return _mm_mul_pd(x, x);
     592             : }
     593             : 
     594             : #ifdef __AVX2__
     595             : 
     596             : inline __m256d SQUARE(__m256d x)
     597             : {
     598             :     return _mm256_mul_pd(x, x);
     599             : }
     600             : 
     601             : inline __m256d FIXUP_LANES(__m256d x)
     602             : {
     603             :     return _mm256_permute4x64_pd(x, _MM_SHUFFLE(3, 1, 2, 0));
     604             : }
     605             : 
     606             : inline __m256 FIXUP_LANES(__m256 x)
     607             : {
     608             :     return _mm256_castpd_ps(FIXUP_LANES(_mm256_castps_pd(x)));
     609             : }
     610             : 
     611             : #endif
     612             : 
     613             : template <class T>
     614             : static int
     615          10 : QuadraticMeanUInt16SSE2(int nDstXWidth, int nChunkXSize,
     616             :                         const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     617             :                         T *CPL_RESTRICT pDstScanline)
     618             : {
     619             :     // Optimized implementation for RMS on UInt16 by
     620             :     // processing by group of 4 output pixels.
     621          10 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     622             : 
     623          10 :     int iDstPixel = 0;
     624          10 :     const auto zero = _mm_setzero_si128();
     625             : 
     626             : #ifdef __AVX2__
     627             :     const auto zeroDot25 = _mm256_set1_pd(0.25);
     628             :     const auto zeroDot5 = _mm256_set1_pd(0.5);
     629             : 
     630             :     // The first four 0's could be anything, as we only take the bottom
     631             :     // 128 bits.
     632             :     const auto permutation = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
     633             : #else
     634          10 :     const auto zeroDot25 = _mm_set1_pd(0.25);
     635          10 :     const auto zeroDot5 = _mm_set1_pd(0.5);
     636             : #endif
     637             : 
     638          40 :     for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
     639             :     {
     640             :         // Load 8 UInt16 from each line
     641          30 :         const auto firstLine = _mm_loadu_si128(
     642             :             reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     643             :         const auto secondLine =
     644          30 :             _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     645          30 :                 pSrcScanlineShifted + nChunkXSize));
     646             : 
     647             :         // Detect if all of the source values fit in 14 bits.
     648             :         // because if x < 2^14, then 4 * x^2 < 2^30 which fits in a signed int32
     649             :         // and we can do a much faster implementation.
     650             :         const auto maskTmp =
     651          60 :             _mm_srli_epi16(_mm_or_si128(firstLine, secondLine), 14);
     652             : #if defined(__i386__) || defined(_M_IX86)
     653             :         uint64_t nMaskFitsIn14Bits = 0;
     654             :         _mm_storel_epi64(
     655             :             reinterpret_cast<__m128i *>(&nMaskFitsIn14Bits),
     656             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     657             : #else
     658          30 :         const auto nMaskFitsIn14Bits = _mm_cvtsi128_si64(
     659             :             _mm_packus_epi16(maskTmp, maskTmp /* could be anything */));
     660             : #endif
     661          30 :         if (nMaskFitsIn14Bits == 0)
     662             :         {
     663             :             // Multiplication of 16 bit values and horizontal
     664             :             // addition of 32 bit results
     665             :             const auto firstLineHSumSquare =
     666          26 :                 _mm_madd_epi16(firstLine, firstLine);
     667             :             const auto secondLineHSumSquare =
     668          26 :                 _mm_madd_epi16(secondLine, secondLine);
     669             :             // Vertical addition
     670             :             const auto sumSquares =
     671          26 :                 _mm_add_epi32(firstLineHSumSquare, secondLineHSumSquare);
     672             :             // In theory we should take sqrt(sumSquares * 0.25f)
     673             :             // but given the rounding we do, this is equivalent to
     674             :             // sqrt((sumSquares + 1)/4). This has been verified exhaustively for
     675             :             // sumSquares <= 4 * 16383^2
     676          26 :             const auto one32 = _mm_set1_epi32(1);
     677             :             const auto sumSquaresPlusOneDiv4 =
     678          52 :                 _mm_srli_epi32(_mm_add_epi32(sumSquares, one32), 2);
     679             :             // Take square root and truncate/floor to int32
     680          78 :             auto rms = _mm_cvttps_epi32(
     681             :                 _mm_sqrt_ps(_mm_cvtepi32_ps(sumSquaresPlusOneDiv4)));
     682             : 
     683             :             // Round to upper value if it minimizes the
     684             :             // error |rms^2 - sumSquares/4|
     685             :             // if( 2 * (2 * rms * (rms + 1) + 1) < sumSquares )
     686             :             //    rms += 1;
     687             :             // which is equivalent to:
     688             :             // if( rms * rms + rms < (sumSquares+1) / 4 )
     689             :             //    rms += 1;
     690             :             auto mask =
     691          78 :                 _mm_cmpgt_epi32(sumSquaresPlusOneDiv4,
     692             :                                 _mm_add_epi32(_mm_madd_epi16(rms, rms), rms));
     693          26 :             rms = _mm_sub_epi32(rms, mask);
     694             :             // Pack each 32 bit RMS value to 16 bits
     695          26 :             rms = _mm_packs_epi32(rms, rms /* could be anything */);
     696             :             _mm_storel_epi64(
     697          26 :                 reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]), rms);
     698          26 :             pSrcScanlineShifted += 8;
     699          26 :             continue;
     700             :         }
     701             : 
     702             :         // An approach using _mm_mullo_epi16, _mm_mulhi_epu16 before extending
     703             :         // to 32 bit would result in 4 multiplications instead of 8, but
     704             :         // mullo/mulhi have a worse throughput than mul_pd.
     705             : 
     706             :         // Extend those UInt16s as UInt32s
     707           4 :         const auto firstLineLo = _mm_unpacklo_epi16(firstLine, zero);
     708           4 :         const auto firstLineHi = _mm_unpackhi_epi16(firstLine, zero);
     709           4 :         const auto secondLineLo = _mm_unpacklo_epi16(secondLine, zero);
     710           4 :         const auto secondLineHi = _mm_unpackhi_epi16(secondLine, zero);
     711             : 
     712             : #ifdef __AVX2__
     713             :         // Multiplication of 32 bit values previously converted to 64 bit double
     714             :         const auto firstLineLoDbl = SQUARE(_mm256_cvtepi32_pd(firstLineLo));
     715             :         const auto firstLineHiDbl = SQUARE(_mm256_cvtepi32_pd(firstLineHi));
     716             :         const auto secondLineLoDbl = SQUARE(_mm256_cvtepi32_pd(secondLineLo));
     717             :         const auto secondLineHiDbl = SQUARE(_mm256_cvtepi32_pd(secondLineHi));
     718             : 
     719             :         // Vertical addition of squares
     720             :         const auto sumSquaresLo =
     721             :             _mm256_add_pd(firstLineLoDbl, secondLineLoDbl);
     722             :         const auto sumSquaresHi =
     723             :             _mm256_add_pd(firstLineHiDbl, secondLineHiDbl);
     724             : 
     725             :         // Horizontal addition of squares
     726             :         const auto sumSquares =
     727             :             FIXUP_LANES(_mm256_hadd_pd(sumSquaresLo, sumSquaresHi));
     728             : 
     729             :         const auto sumDivWeight = _mm256_mul_pd(sumSquares, zeroDot25);
     730             : 
     731             :         // Take square root and truncate/floor to int32
     732             :         auto rms = _mm256_cvttpd_epi32(_mm256_sqrt_pd(sumDivWeight));
     733             :         const auto rmsDouble = _mm256_cvtepi32_pd(rms);
     734             :         const auto right = _mm256_sub_pd(
     735             :             sumDivWeight, _mm256_add_pd(SQUARE(rmsDouble), rmsDouble));
     736             : 
     737             :         auto mask =
     738             :             _mm256_castpd_ps(_mm256_cmp_pd(zeroDot5, right, _CMP_LT_OS));
     739             :         // Extract 32-bit from each of the 4 64-bit masks
     740             :         // mask = FIXUP_LANES(_mm256_shuffle_ps(mask, mask,
     741             :         // _MM_SHUFFLE(2,0,2,0)));
     742             :         mask = _mm256_permutevar8x32_ps(mask, permutation);
     743             :         const auto maskI = _mm_castps_si128(_mm256_extractf128_ps(mask, 0));
     744             : 
     745             :         // Apply the correction
     746             :         rms = _mm_sub_epi32(rms, maskI);
     747             : 
     748             :         // Pack each 32 bit RMS value to 16 bits
     749             :         rms = _mm_packus_epi32(rms, rms /* could be anything */);
     750             : #else
     751             :         // Multiplication of 32 bit values previously converted to 64 bit double
     752           4 :         const auto firstLineLoLo = SQUARE(_mm_cvtepi32_pd(firstLineLo));
     753             :         const auto firstLineLoHi =
     754           8 :             SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineLo, 8)));
     755           4 :         const auto firstLineHiLo = SQUARE(_mm_cvtepi32_pd(firstLineHi));
     756             :         const auto firstLineHiHi =
     757           8 :             SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(firstLineHi, 8)));
     758             : 
     759           4 :         const auto secondLineLoLo = SQUARE(_mm_cvtepi32_pd(secondLineLo));
     760             :         const auto secondLineLoHi =
     761           8 :             SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineLo, 8)));
     762           4 :         const auto secondLineHiLo = SQUARE(_mm_cvtepi32_pd(secondLineHi));
     763             :         const auto secondLineHiHi =
     764           8 :             SQUARE(_mm_cvtepi32_pd(_mm_srli_si128(secondLineHi, 8)));
     765             : 
     766             :         // Vertical addition of squares
     767           4 :         const auto sumSquaresLoLo = _mm_add_pd(firstLineLoLo, secondLineLoLo);
     768           4 :         const auto sumSquaresLoHi = _mm_add_pd(firstLineLoHi, secondLineLoHi);
     769           4 :         const auto sumSquaresHiLo = _mm_add_pd(firstLineHiLo, secondLineHiLo);
     770           4 :         const auto sumSquaresHiHi = _mm_add_pd(firstLineHiHi, secondLineHiHi);
     771             : 
     772             :         // Horizontal addition of squares
     773           4 :         const auto sumSquaresLo = sse2_hadd_pd(sumSquaresLoLo, sumSquaresLoHi);
     774           4 :         const auto sumSquaresHi = sse2_hadd_pd(sumSquaresHiLo, sumSquaresHiHi);
     775             : 
     776           4 :         const auto sumDivWeightLo = _mm_mul_pd(sumSquaresLo, zeroDot25);
     777           4 :         const auto sumDivWeightHi = _mm_mul_pd(sumSquaresHi, zeroDot25);
     778             :         // Take square root and truncate/floor to int32
     779           8 :         const auto rmsLo = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightLo));
     780           8 :         const auto rmsHi = _mm_cvttpd_epi32(_mm_sqrt_pd(sumDivWeightHi));
     781             : 
     782             :         // Correctly round rms to minimize | rms^2 - sumSquares / 4 |
     783             :         // if( 0.5 < sumDivWeight - (rms * rms + rms) )
     784             :         //     rms += 1;
     785           4 :         const auto rmsLoDouble = _mm_cvtepi32_pd(rmsLo);
     786           4 :         const auto rmsHiDouble = _mm_cvtepi32_pd(rmsHi);
     787           8 :         const auto rightLo = _mm_sub_pd(
     788             :             sumDivWeightLo, _mm_add_pd(SQUARE(rmsLoDouble), rmsLoDouble));
     789          12 :         const auto rightHi = _mm_sub_pd(
     790             :             sumDivWeightHi, _mm_add_pd(SQUARE(rmsHiDouble), rmsHiDouble));
     791             : 
     792           8 :         const auto maskLo = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightLo));
     793           4 :         const auto maskHi = _mm_castpd_ps(_mm_cmplt_pd(zeroDot5, rightHi));
     794             :         // The value of the mask will be -1 when the correction needs to be
     795             :         // applied
     796           8 :         const auto mask = _mm_castps_si128(_mm_shuffle_ps(
     797             :             maskLo, maskHi, (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6)));
     798             : 
     799          16 :         auto rms = _mm_castps_si128(
     800             :             _mm_movelh_ps(_mm_castsi128_ps(rmsLo), _mm_castsi128_ps(rmsHi)));
     801             :         // Apply the correction
     802           4 :         rms = _mm_sub_epi32(rms, mask);
     803             : 
     804             :         // Pack each 32 bit RMS value to 16 bits
     805           4 :         rms = sse2_packus_epi32(rms, rms /* could be anything */);
     806             : #endif
     807             : 
     808           4 :         _mm_storel_epi64(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
     809             :                          rms);
     810           4 :         pSrcScanlineShifted += 8;
     811             :     }
     812             : 
     813             :     zeroupper();
     814             : 
     815          10 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     816          10 :     return iDstPixel;
     817             : }
     818             : 
     819             : /************************************************************************/
     820             : /*                         AverageUInt16SSE2()                          */
     821             : /************************************************************************/
     822             : 
     823             : template <class T>
     824           9 : static int AverageUInt16SSE2(int nDstXWidth, int nChunkXSize,
     825             :                              const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     826             :                              T *CPL_RESTRICT pDstScanline)
     827             : {
     828             :     // Optimized implementation for average on UInt16 by
     829             :     // processing by group of 8 output pixels.
     830             : 
     831           9 :     const auto mask = _mm_set1_epi32(0xFFFF);
     832           9 :     const auto two = _mm_set1_epi32(2);
     833           9 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     834             : 
     835           9 :     int iDstPixel = 0;
     836          13 :     for (; iDstPixel < nDstXWidth - 7; iDstPixel += 8)
     837             :     {
     838             :         __m128i averageLow;
     839             :         // Load 8 UInt16 from each line
     840             :         {
     841           4 :             const auto firstLine = _mm_loadu_si128(
     842             :                 reinterpret_cast<__m128i const *>(pSrcScanlineShifted));
     843             :             const auto secondLine =
     844           4 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     845           4 :                     pSrcScanlineShifted + nChunkXSize));
     846             : 
     847             :             // Horizontal addition and extension to 32 bit
     848          12 :             const auto horizAddFirstLine = _mm_add_epi32(
     849             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     850             :             const auto horizAddSecondLine =
     851          12 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     852             :                               _mm_srli_epi32(secondLine, 16));
     853             : 
     854             :             // Vertical addition and average computation
     855             :             // average = (sum + 2) >> 2
     856           8 :             const auto sum = _mm_add_epi32(
     857             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     858           4 :             averageLow = _mm_srli_epi32(sum, 2);
     859             :         }
     860             :         // Load 8 UInt16 from each line
     861             :         __m128i averageHigh;
     862             :         {
     863           4 :             const auto firstLine = _mm_loadu_si128(
     864           4 :                 reinterpret_cast<__m128i const *>(pSrcScanlineShifted + 8));
     865             :             const auto secondLine =
     866           4 :                 _mm_loadu_si128(reinterpret_cast<__m128i const *>(
     867           4 :                     pSrcScanlineShifted + 8 + nChunkXSize));
     868             : 
     869             :             // Horizontal addition and extension to 32 bit
     870          12 :             const auto horizAddFirstLine = _mm_add_epi32(
     871             :                 _mm_and_si128(firstLine, mask), _mm_srli_epi32(firstLine, 16));
     872             :             const auto horizAddSecondLine =
     873          12 :                 _mm_add_epi32(_mm_and_si128(secondLine, mask),
     874             :                               _mm_srli_epi32(secondLine, 16));
     875             : 
     876             :             // Vertical addition and average computation
     877             :             // average = (sum + 2) >> 2
     878           8 :             const auto sum = _mm_add_epi32(
     879             :                 _mm_add_epi32(horizAddFirstLine, horizAddSecondLine), two);
     880           4 :             averageHigh = _mm_srli_epi32(sum, 2);
     881             :         }
     882             : 
     883             :         // Pack each 32 bit average value to 16 bits
     884           4 :         auto average = sse2_packus_epi32(averageLow, averageHigh);
     885           4 :         _mm_storeu_si128(reinterpret_cast<__m128i *>(&pDstScanline[iDstPixel]),
     886             :                          average);
     887           4 :         pSrcScanlineShifted += 16;
     888             :     }
     889             : 
     890           9 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
     891           9 :     return iDstPixel;
     892             : }
     893             : 
     894             : /************************************************************************/
     895             : /*                      QuadraticMeanFloatSSE2()                        */
     896             : /************************************************************************/
     897             : 
     898             : #ifdef __AVX2__
     899             : #define RMS_FLOAT_ELTS 8
     900             : #define set1_ps _mm256_set1_ps
     901             : #define loadu_ps _mm256_loadu_ps
     902             : #define andnot_ps _mm256_andnot_ps
     903             : #define and_ps _mm256_and_ps
     904             : #define max_ps _mm256_max_ps
     905             : #define shuffle_ps _mm256_shuffle_ps
     906             : #define div_ps _mm256_div_ps
     907             : #define cmpeq_ps(x, y) _mm256_cmp_ps(x, y, _CMP_EQ_OQ)
     908             : #define mul_ps _mm256_mul_ps
     909             : #define add_ps _mm256_add_ps
     910             : #define hadd_ps _mm256_hadd_ps
     911             : #define sqrt_ps _mm256_sqrt_ps
     912             : #define or_ps _mm256_or_ps
     913             : #define unpacklo_ps _mm256_unpacklo_ps
     914             : #define unpackhi_ps _mm256_unpackhi_ps
     915             : #define storeu_ps _mm256_storeu_ps
     916             : 
     917             : inline __m256 SQUARE(__m256 x)
     918             : {
     919             :     return _mm256_mul_ps(x, x);
     920             : }
     921             : 
     922             : #else
     923             : 
     924             : #ifdef __SSE3__
     925             : #define sse2_hadd_ps _mm_hadd_ps
     926             : #else
     927             : inline __m128 sse2_hadd_ps(__m128 a, __m128 b)
     928             : {
     929             :     auto aEven_bEven = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
     930             :     auto aOdd_bOdd = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
     931             :     return _mm_add_ps(aEven_bEven, aOdd_bOdd);  // (aEven + aOdd, bEven + bOdd)
     932             : }
     933             : #endif
     934             : 
     935             : #define RMS_FLOAT_ELTS 4
     936             : #define set1_ps _mm_set1_ps
     937             : #define loadu_ps _mm_loadu_ps
     938             : #define andnot_ps _mm_andnot_ps
     939             : #define and_ps _mm_and_ps
     940             : #define max_ps _mm_max_ps
     941             : #define shuffle_ps _mm_shuffle_ps
     942             : #define div_ps _mm_div_ps
     943             : #define cmpeq_ps _mm_cmpeq_ps
     944             : #define mul_ps _mm_mul_ps
     945             : #define add_ps _mm_add_ps
     946             : #define hadd_ps sse2_hadd_ps
     947             : #define sqrt_ps _mm_sqrt_ps
     948             : #define or_ps _mm_or_ps
     949             : #define unpacklo_ps _mm_unpacklo_ps
     950             : #define unpackhi_ps _mm_unpackhi_ps
     951             : #define storeu_ps _mm_storeu_ps
     952             : 
     953         272 : inline __m128 SQUARE(__m128 x)
     954             : {
     955         272 :     return _mm_mul_ps(x, x);
     956             : }
     957             : 
     958          68 : inline __m128 FIXUP_LANES(__m128 x)
     959             : {
     960          68 :     return x;
     961             : }
     962             : 
     963             : #endif
     964             : 
     965             : template <class T>
     966             : static int NOINLINE
     967          34 : QuadraticMeanFloatSSE2(int nDstXWidth, int nChunkXSize,
     968             :                        const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
     969             :                        T *CPL_RESTRICT pDstScanline)
     970             : {
     971             :     // Optimized implementation for RMS on Float32 by
     972             :     // processing by group of RMS_FLOAT_ELTS output pixels.
     973          34 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
     974             : 
     975          34 :     int iDstPixel = 0;
     976          34 :     const auto minus_zero = set1_ps(-0.0f);
     977          34 :     const auto zeroDot25 = set1_ps(0.25f);
     978          34 :     const auto one = set1_ps(1.0f);
     979          68 :     const auto infv = set1_ps(std::numeric_limits<float>::infinity());
     980             : 
     981         102 :     for (; iDstPixel < nDstXWidth - (RMS_FLOAT_ELTS - 1);
     982             :          iDstPixel += RMS_FLOAT_ELTS)
     983             :     {
     984             :         // Load 2*RMS_FLOAT_ELTS Float32 from each line
     985             :         auto firstLineLo =
     986          68 :             loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
     987          68 :         auto firstLineHi = loadu_ps(reinterpret_cast<float const *>(
     988          68 :             pSrcScanlineShifted + RMS_FLOAT_ELTS));
     989          68 :         auto secondLineLo = loadu_ps(
     990          68 :             reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
     991          68 :         auto secondLineHi = loadu_ps(reinterpret_cast<float const *>(
     992          68 :             pSrcScanlineShifted + RMS_FLOAT_ELTS + nChunkXSize));
     993             : 
     994             :         // Take the absolute value
     995          68 :         firstLineLo = andnot_ps(minus_zero, firstLineLo);
     996          68 :         firstLineHi = andnot_ps(minus_zero, firstLineHi);
     997          68 :         secondLineLo = andnot_ps(minus_zero, secondLineLo);
     998          68 :         secondLineHi = andnot_ps(minus_zero, secondLineHi);
     999             : 
    1000             :         auto firstLineEven =
    1001          68 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1002             :         auto firstLineOdd =
    1003          68 :             shuffle_ps(firstLineLo, firstLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1004             :         auto secondLineEven =
    1005          68 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(2, 0, 2, 0));
    1006             :         auto secondLineOdd =
    1007          68 :             shuffle_ps(secondLineLo, secondLineHi, _MM_SHUFFLE(3, 1, 3, 1));
    1008             : 
    1009             :         // Compute the maximum of each RMS_FLOAT_ELTS value to RMS-average
    1010         204 :         const auto maxV = max_ps(max_ps(firstLineEven, firstLineOdd),
    1011             :                                  max_ps(secondLineEven, secondLineEven));
    1012             : 
    1013             :         // Normalize each value by the maximum of the RMS_FLOAT_ELTS ones.
    1014             :         // This step is important to avoid that the square evaluates to infinity
    1015             :         // for sufficiently big input.
    1016          68 :         auto invMax = div_ps(one, maxV);
    1017             :         // Deal with 0 being the maximum to correct division by zero
    1018             :         // note: comparing to -0 leads to identical results as to comparing with
    1019             :         // 0
    1020         136 :         invMax = andnot_ps(cmpeq_ps(maxV, minus_zero), invMax);
    1021             : 
    1022          68 :         firstLineEven = mul_ps(firstLineEven, invMax);
    1023          68 :         firstLineOdd = mul_ps(firstLineOdd, invMax);
    1024          68 :         secondLineEven = mul_ps(secondLineEven, invMax);
    1025          68 :         secondLineOdd = mul_ps(secondLineOdd, invMax);
    1026             : 
    1027             :         // Compute squares
    1028          68 :         firstLineEven = SQUARE(firstLineEven);
    1029          68 :         firstLineOdd = SQUARE(firstLineOdd);
    1030          68 :         secondLineEven = SQUARE(secondLineEven);
    1031          68 :         secondLineOdd = SQUARE(secondLineOdd);
    1032             : 
    1033         204 :         const auto sumSquares = add_ps(add_ps(firstLineEven, firstLineOdd),
    1034             :                                        add_ps(secondLineEven, secondLineOdd));
    1035             : 
    1036         204 :         auto rms = mul_ps(maxV, sqrt_ps(mul_ps(sumSquares, zeroDot25)));
    1037             : 
    1038             :         // Deal with infinity being the maximum
    1039          68 :         const auto maskIsInf = cmpeq_ps(maxV, infv);
    1040         136 :         rms = or_ps(andnot_ps(maskIsInf, rms), and_ps(maskIsInf, infv));
    1041             : 
    1042          68 :         rms = FIXUP_LANES(rms);
    1043             : 
    1044             :         // coverity[incompatible_cast]
    1045          68 :         storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]), rms);
    1046          68 :         pSrcScanlineShifted += RMS_FLOAT_ELTS * 2;
    1047             :     }
    1048             : 
    1049             :     zeroupper();
    1050             : 
    1051          34 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    1052          34 :     return iDstPixel;
    1053             : }
    1054             : 
    1055             : /************************************************************************/
    1056             : /*                        AverageFloatSSE2()                            */
    1057             : /************************************************************************/
    1058             : 
    1059             : template <class T>
    1060          27 : static int AverageFloatSSE2(int nDstXWidth, int nChunkXSize,
    1061             :                             const T *&CPL_RESTRICT pSrcScanlineShiftedInOut,
    1062             :                             T *CPL_RESTRICT pDstScanline)
    1063             : {
    1064             :     // Optimized implementation for average on Float32 by
    1065             :     // processing by group of 4 output pixels.
    1066          27 :     const T *CPL_RESTRICT pSrcScanlineShifted = pSrcScanlineShiftedInOut;
    1067             : 
    1068          27 :     int iDstPixel = 0;
    1069          27 :     const auto zeroDot25 = _mm_set1_ps(0.25f);
    1070             : 
    1071          55 :     for (; iDstPixel < nDstXWidth - 3; iDstPixel += 4)
    1072             :     {
    1073             :         // Load 8 Float32 from each line
    1074             :         const auto firstLineLo =
    1075          28 :             _mm_loadu_ps(reinterpret_cast<float const *>(pSrcScanlineShifted));
    1076          28 :         const auto firstLineHi = _mm_loadu_ps(
    1077          28 :             reinterpret_cast<float const *>(pSrcScanlineShifted + 4));
    1078          28 :         const auto secondLineLo = _mm_loadu_ps(
    1079          28 :             reinterpret_cast<float const *>(pSrcScanlineShifted + nChunkXSize));
    1080          28 :         const auto secondLineHi = _mm_loadu_ps(reinterpret_cast<float const *>(
    1081          28 :             pSrcScanlineShifted + 4 + nChunkXSize));
    1082             : 
    1083             :         // Vertical addition
    1084          28 :         const auto sumLo = _mm_add_ps(firstLineLo, secondLineLo);
    1085          28 :         const auto sumHi = _mm_add_ps(firstLineHi, secondLineHi);
    1086             : 
    1087             :         // Horizontal addition
    1088             :         const auto A =
    1089          28 :             _mm_shuffle_ps(sumLo, sumHi, 0 | (2 << 2) | (0 << 4) | (2 << 6));
    1090             :         const auto B =
    1091          28 :             _mm_shuffle_ps(sumLo, sumHi, 1 | (3 << 2) | (1 << 4) | (3 << 6));
    1092          28 :         const auto sum = _mm_add_ps(A, B);
    1093             : 
    1094          28 :         const auto average = _mm_mul_ps(sum, zeroDot25);
    1095             : 
    1096             :         // coverity[incompatible_cast]
    1097          28 :         _mm_storeu_ps(reinterpret_cast<float *>(&pDstScanline[iDstPixel]),
    1098             :                       average);
    1099          28 :         pSrcScanlineShifted += 8;
    1100             :     }
    1101             : 
    1102          27 :     pSrcScanlineShiftedInOut = pSrcScanlineShifted;
    1103          27 :     return iDstPixel;
    1104             : }
    1105             : 
    1106             : #endif
    1107             : 
    1108             : /************************************************************************/
    1109             : /*                    GDALResampleChunk_AverageOrRMS()                  */
    1110             : /************************************************************************/
    1111             : 
    1112             : template <class T, class Tsum, GDALDataType eWrkDataType>
    1113        2287 : static CPLErr GDALResampleChunk_AverageOrRMS_T(
    1114             :     double dfXRatioDstToSrc, double dfYRatioDstToSrc, double dfSrcXDelta,
    1115             :     double dfSrcYDelta, const T *pChunk, const GByte *pabyChunkNodataMask,
    1116             :     int nChunkXOff, int nChunkXSize, int nChunkYOff, int nChunkYSize,
    1117             :     int nDstXOff, int nDstXOff2, int nDstYOff, int nDstYOff2,
    1118             :     GDALRasterBand *poOverview, void **ppDstBuffer, const char *pszResampling,
    1119             :     bool bHasNoData, double dfNoDataValue, GDALColorTable *poColorTable,
    1120             :     bool bPropagateNoData)
    1121             : {
    1122             :     // AVERAGE_BIT2GRAYSCALE
    1123             :     const bool bBit2Grayscale =
    1124        2287 :         CPL_TO_BOOL(STARTS_WITH_CI(pszResampling, "AVERAGE_BIT2G"));
    1125        2287 :     const bool bQuadraticMean = CPL_TO_BOOL(EQUAL(pszResampling, "RMS"));
    1126        2287 :     if (bBit2Grayscale)
    1127           9 :         poColorTable = nullptr;
    1128             : 
    1129             :     T tNoDataValue;
    1130        2287 :     if (!bHasNoData)
    1131        2240 :         tNoDataValue = 0;
    1132             :     else
    1133          47 :         tNoDataValue = static_cast<T>(dfNoDataValue);
    1134        2287 :     const T tReplacementVal =
    1135          77 :         bHasNoData ? static_cast<T>(GDALGetNoDataReplacementValue(
    1136             :                          poOverview->GetRasterDataType(), dfNoDataValue))
    1137             :                    : 0;
    1138             : 
    1139        2287 :     int nChunkRightXOff = nChunkXOff + nChunkXSize;
    1140        2287 :     int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    1141        2287 :     int nDstXWidth = nDstXOff2 - nDstXOff;
    1142             : 
    1143             :     /* -------------------------------------------------------------------- */
    1144             :     /*      Allocate buffers.                                               */
    1145             :     /* -------------------------------------------------------------------- */
    1146        2287 :     *ppDstBuffer = static_cast<T *>(
    1147        2287 :         VSI_MALLOC3_VERBOSE(nDstXWidth, nDstYOff2 - nDstYOff,
    1148             :                             GDALGetDataTypeSizeBytes(eWrkDataType)));
    1149        2287 :     if (*ppDstBuffer == nullptr)
    1150             :     {
    1151           0 :         return CE_Failure;
    1152             :     }
    1153        2287 :     T *const pDstBuffer = static_cast<T *>(*ppDstBuffer);
    1154             : 
    1155             :     struct PrecomputedXValue
    1156             :     {
    1157             :         int nLeftXOffShifted;
    1158             :         int nRightXOffShifted;
    1159             :         double dfLeftWeight;
    1160             :         double dfRightWeight;
    1161             :         double dfTotalWeightFullLine;
    1162             :     };
    1163             : 
    1164             :     PrecomputedXValue *pasSrcX = static_cast<PrecomputedXValue *>(
    1165        2287 :         VSI_MALLOC_VERBOSE(nDstXWidth * sizeof(PrecomputedXValue)));
    1166             : 
    1167        2287 :     if (pasSrcX == nullptr)
    1168             :     {
    1169           0 :         VSIFree(pasSrcX);
    1170           0 :         return CE_Failure;
    1171             :     }
    1172             : 
    1173        2287 :     int nTransparentIdx = -1;
    1174        2287 :     std::vector<GDALColorEntry> colorEntries;
    1175        2287 :     if (poColorTable)
    1176           5 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    1177             : 
    1178             :     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    1179             :     // it as nodata value
    1180        2310 :     if (bHasNoData && dfNoDataValue >= 0.0f &&
    1181          23 :         tNoDataValue < colorEntries.size())
    1182           1 :         colorEntries[static_cast<int>(tNoDataValue)].c4 = 0;
    1183             : 
    1184             :     // Or if we have no explicit nodata, but a color table entry that is
    1185             :     // transparent, consider it as the nodata value
    1186        2286 :     else if (!bHasNoData && nTransparentIdx >= 0)
    1187             :     {
    1188           0 :         bHasNoData = TRUE;
    1189           0 :         tNoDataValue = static_cast<T>(nTransparentIdx);
    1190             :     }
    1191             : 
    1192             :     /* ==================================================================== */
    1193             :     /*      Precompute inner loop constants.                                */
    1194             :     /* ==================================================================== */
    1195        2287 :     bool bSrcXSpacingIsTwo = true;
    1196        2287 :     int nLastSrcXOff2 = -1;
    1197      848986 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    1198             :     {
    1199      846699 :         double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    1200             :         // Apply some epsilon to avoid numerical precision issues
    1201      846699 :         int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    1202      846699 :         double dfSrcXOff2 = dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    1203      846699 :         int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    1204             : 
    1205      846699 :         if (nSrcXOff < nChunkXOff)
    1206           0 :             nSrcXOff = nChunkXOff;
    1207      846699 :         if (nSrcXOff2 == nSrcXOff)
    1208           0 :             nSrcXOff2++;
    1209      846699 :         if (nSrcXOff2 > nChunkRightXOff)
    1210           1 :             nSrcXOff2 = nChunkRightXOff;
    1211             : 
    1212      846699 :         pasSrcX[iDstPixel - nDstXOff].nLeftXOffShifted = nSrcXOff - nChunkXOff;
    1213      846699 :         pasSrcX[iDstPixel - nDstXOff].nRightXOffShifted =
    1214      846699 :             nSrcXOff2 - nChunkXOff;
    1215          21 :         pasSrcX[iDstPixel - nDstXOff].dfLeftWeight =
    1216      846699 :             (nSrcXOff2 == nSrcXOff + 1) ? 1.0 : 1 - (dfSrcXOff - nSrcXOff);
    1217      846699 :         pasSrcX[iDstPixel - nDstXOff].dfRightWeight =
    1218      846699 :             1 - (nSrcXOff2 - dfSrcXOff2);
    1219      846699 :         pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine =
    1220      846699 :             pasSrcX[iDstPixel - nDstXOff].dfLeftWeight;
    1221      846699 :         if (nSrcXOff + 1 < nSrcXOff2)
    1222             :         {
    1223      846678 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1224      846678 :                 nSrcXOff2 - nSrcXOff - 2;
    1225      846678 :             pasSrcX[iDstPixel - nDstXOff].dfTotalWeightFullLine +=
    1226      846678 :                 pasSrcX[iDstPixel - nDstXOff].dfRightWeight;
    1227             :         }
    1228             : 
    1229      846699 :         if (nSrcXOff2 - nSrcXOff != 2 ||
    1230      726341 :             (nLastSrcXOff2 >= 0 && nLastSrcXOff2 != nSrcXOff))
    1231             :         {
    1232      119568 :             bSrcXSpacingIsTwo = false;
    1233             :         }
    1234      846699 :         nLastSrcXOff2 = nSrcXOff2;
    1235             :     }
    1236             : 
    1237             :     /* ==================================================================== */
    1238             :     /*      Loop over destination scanlines.                                */
    1239             :     /* ==================================================================== */
    1240      718529 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    1241             :     {
    1242      716242 :         double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    1243      716242 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    1244      716242 :         if (nSrcYOff < nChunkYOff)
    1245           0 :             nSrcYOff = nChunkYOff;
    1246             : 
    1247      716242 :         double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    1248      716242 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    1249      716242 :         if (nSrcYOff2 == nSrcYOff)
    1250           0 :             ++nSrcYOff2;
    1251      716242 :         if (nSrcYOff2 > nChunkBottomYOff)
    1252           3 :             nSrcYOff2 = nChunkBottomYOff;
    1253             : 
    1254      716242 :         T *const pDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
    1255             : 
    1256             :         /* --------------------------------------------------------------------
    1257             :          */
    1258             :         /*      Loop over destination pixels */
    1259             :         /* --------------------------------------------------------------------
    1260             :          */
    1261      716242 :         if (poColorTable == nullptr)
    1262             :         {
    1263      716127 :             if (bSrcXSpacingIsTwo && nSrcYOff2 == nSrcYOff + 2 &&
    1264             :                 pabyChunkNodataMask == nullptr)
    1265             :             {
    1266             :                 if (eWrkDataType == GDT_Byte || eWrkDataType == GDT_UInt16)
    1267             :                 {
    1268             :                     // Optimized case : no nodata, overview by a factor of 2 and
    1269             :                     // regular x and y src spacing.
    1270      116350 :                     const T *pSrcScanlineShifted =
    1271      116350 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1272      116350 :                         static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
    1273      116350 :                             nChunkXSize;
    1274      116350 :                     int iDstPixel = 0;
    1275             : #ifdef USE_SSE2
    1276      116331 :                     if (bQuadraticMean && eWrkDataType == GDT_Byte)
    1277             :                     {
    1278        5385 :                         iDstPixel = QuadraticMeanByteSSE2OrAVX2(
    1279             :                             nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1280             :                             pDstScanline);
    1281             :                     }
    1282      110965 :                     else if (bQuadraticMean /* && eWrkDataType == GDT_UInt16 */)
    1283             :                     {
    1284          10 :                         iDstPixel = QuadraticMeanUInt16SSE2(
    1285             :                             nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1286             :                             pDstScanline);
    1287             :                     }
    1288             :                     else if (/* !bQuadraticMean && */ eWrkDataType == GDT_Byte)
    1289             :                     {
    1290      110946 :                         iDstPixel = AverageByteSSE2OrAVX2(
    1291             :                             nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1292             :                             pDstScanline);
    1293             :                     }
    1294             :                     else /* if( !bQuadraticMean && eWrkDataType == GDT_UInt16 )
    1295             :                           */
    1296             :                     {
    1297           9 :                         iDstPixel = AverageUInt16SSE2(nDstXWidth, nChunkXSize,
    1298             :                                                       pSrcScanlineShifted,
    1299             :                                                       pDstScanline);
    1300             :                     }
    1301             : #endif
    1302      278621 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1303             :                     {
    1304      162271 :                         Tsum nTotal = 0;
    1305             :                         T nVal;
    1306      162271 :                         if (bQuadraticMean)
    1307          44 :                             nTotal =
    1308          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[0]) +
    1309          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[1]) +
    1310          44 :                                 SQUARE<Tsum>(pSrcScanlineShifted[nChunkXSize]) +
    1311          44 :                                 SQUARE<Tsum>(
    1312          44 :                                     pSrcScanlineShifted[1 + nChunkXSize]);
    1313             :                         else
    1314      162227 :                             nTotal = pSrcScanlineShifted[0] +
    1315      162227 :                                      pSrcScanlineShifted[1] +
    1316      162227 :                                      pSrcScanlineShifted[nChunkXSize] +
    1317      162227 :                                      pSrcScanlineShifted[1 + nChunkXSize];
    1318             : 
    1319      162271 :                         constexpr int nTotalWeight = 4;
    1320      162271 :                         if (bQuadraticMean)
    1321          44 :                             nVal = ComputeIntegerRMS_4values<T>(nTotal);
    1322             :                         else
    1323      162227 :                             nVal = static_cast<T>((nTotal + nTotalWeight / 2) /
    1324             :                                                   nTotalWeight);
    1325             : 
    1326             :                         // No need to compare nVal against tNoDataValue as we
    1327             :                         // are in a case where pabyChunkNodataMask == nullptr
    1328             :                         // implies the absence of nodata value.
    1329      162271 :                         pDstScanline[iDstPixel] = nVal;
    1330      162271 :                         pSrcScanlineShifted += 2;
    1331             :                     }
    1332             :                 }
    1333             :                 else
    1334             :                 {
    1335             :                     CPLAssert(eWrkDataType == GDT_Float32 ||
    1336             :                               eWrkDataType == GDT_Float64);
    1337          70 :                     const T *pSrcScanlineShifted =
    1338          70 :                         pChunk + pasSrcX[0].nLeftXOffShifted +
    1339          70 :                         static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) *
    1340          70 :                             nChunkXSize;
    1341          70 :                     int iDstPixel = 0;
    1342             : #ifdef USE_SSE2
    1343             :                     if (eWrkDataType == GDT_Float32)
    1344             :                     {
    1345          61 :                         if (bQuadraticMean)
    1346             :                         {
    1347          34 :                             iDstPixel = QuadraticMeanFloatSSE2(
    1348             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1349             :                                 pDstScanline);
    1350             :                         }
    1351             :                         else
    1352             :                         {
    1353          27 :                             iDstPixel = AverageFloatSSE2(
    1354             :                                 nDstXWidth, nChunkXSize, pSrcScanlineShifted,
    1355             :                                 pDstScanline);
    1356             :                         }
    1357             :                     }
    1358             : #endif
    1359             : 
    1360         228 :                     for (; iDstPixel < nDstXWidth; ++iDstPixel)
    1361             :                     {
    1362             :                         T nVal;
    1363         158 :                         if (bQuadraticMean)
    1364             :                         {
    1365             :                             // Cast to double to avoid overflows
    1366             :                             // (using std::hypot() is much slower)
    1367         100 :                             nVal = static_cast<T>(std::sqrt(
    1368             :                                 0.25 *
    1369         100 :                                 (SQUARE<double>(pSrcScanlineShifted[0]) +
    1370         100 :                                  SQUARE<double>(pSrcScanlineShifted[1]) +
    1371         100 :                                  SQUARE<double>(
    1372         200 :                                      pSrcScanlineShifted[nChunkXSize]) +
    1373         100 :                                  SQUARE<double>(
    1374         100 :                                      pSrcScanlineShifted[1 + nChunkXSize]))));
    1375             :                         }
    1376             :                         else
    1377             :                         {
    1378          58 :                             nVal = static_cast<T>(
    1379          58 :                                 0.25f * (pSrcScanlineShifted[0] +
    1380          58 :                                          pSrcScanlineShifted[1] +
    1381          58 :                                          pSrcScanlineShifted[nChunkXSize] +
    1382          58 :                                          pSrcScanlineShifted[1 + nChunkXSize]));
    1383             :                         }
    1384             : 
    1385             :                         // No need to compare nVal against tNoDataValue as we
    1386             :                         // are in a case where pabyChunkNodataMask == nullptr
    1387             :                         // implies the absence of nodata value.
    1388         158 :                         pDstScanline[iDstPixel] = nVal;
    1389         158 :                         pSrcScanlineShifted += 2;
    1390             :                     }
    1391      116420 :                 }
    1392             :             }
    1393             :             else
    1394             :             {
    1395          17 :                 const double dfBottomWeight =
    1396      599707 :                     (nSrcYOff + 1 == nSrcYOff2) ? 1.0
    1397      599690 :                                                 : 1.0 - (dfSrcYOff - nSrcYOff);
    1398      599707 :                 const double dfTopWeight = 1.0 - (nSrcYOff2 - dfSrcYOff2);
    1399      599707 :                 nSrcYOff -= nChunkYOff;
    1400      599707 :                 nSrcYOff2 -= nChunkYOff;
    1401             : 
    1402      599707 :                 double dfTotalWeightFullColumn = dfBottomWeight;
    1403      599707 :                 if (nSrcYOff + 1 < nSrcYOff2)
    1404             :                 {
    1405      599690 :                     dfTotalWeightFullColumn += nSrcYOff2 - nSrcYOff - 2;
    1406      599690 :                     dfTotalWeightFullColumn += dfTopWeight;
    1407             :                 }
    1408             : 
    1409    18032356 :                 for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1410             :                 {
    1411    17431981 :                     const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1412    17431981 :                     const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1413             : 
    1414    17431981 :                     double dfTotal = 0;
    1415    17431981 :                     double dfTotalWeight = 0;
    1416    17431981 :                     if (pabyChunkNodataMask == nullptr)
    1417             :                     {
    1418     1746435 :                         auto pChunkShifted =
    1419         115 :                             pChunk +
    1420     1746435 :                             static_cast<GPtrDiff_t>(nSrcYOff) * nChunkXSize;
    1421     1746435 :                         int nCounterY = nSrcYOff2 - nSrcYOff - 1;
    1422     1746435 :                         double dfWeightY = dfBottomWeight;
    1423     3493427 :                         while (true)
    1424             :                         {
    1425             :                             double dfTotalLine;
    1426     5239852 :                             if (bQuadraticMean)
    1427             :                             {
    1428             :                                 // Left pixel
    1429             :                                 {
    1430         104 :                                     const T val = pChunkShifted[nSrcXOff];
    1431         104 :                                     dfTotalLine =
    1432         104 :                                         SQUARE<double>(val) *
    1433         104 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1434             :                                 }
    1435             : 
    1436         104 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1437             :                                 {
    1438             :                                     // Middle pixels
    1439         104 :                                     for (int iX = nSrcXOff + 1;
    1440         424 :                                          iX + 1 < nSrcXOff2; ++iX)
    1441             :                                     {
    1442         320 :                                         const T val = pChunkShifted[iX];
    1443         320 :                                         dfTotalLine += SQUARE<double>(val);
    1444             :                                     }
    1445             : 
    1446             :                                     // Right pixel
    1447             :                                     {
    1448         104 :                                         const T val =
    1449         104 :                                             pChunkShifted[nSrcXOff2 - 1];
    1450         104 :                                         dfTotalLine +=
    1451         104 :                                             SQUARE<double>(val) *
    1452         104 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1453             :                                     }
    1454             :                                 }
    1455             :                             }
    1456             :                             else
    1457             :                             {
    1458             :                                 // Left pixel
    1459             :                                 {
    1460     5239756 :                                     const T val = pChunkShifted[nSrcXOff];
    1461     5239756 :                                     dfTotalLine =
    1462     5239756 :                                         val * pasSrcX[iDstPixel].dfLeftWeight;
    1463             :                                 }
    1464             : 
    1465     5239756 :                                 if (nSrcXOff + 1 < nSrcXOff2)
    1466             :                                 {
    1467             :                                     // Middle pixels
    1468     4239330 :                                     for (int iX = nSrcXOff + 1;
    1469    64183126 :                                          iX + 1 < nSrcXOff2; ++iX)
    1470             :                                     {
    1471    59943836 :                                         const T val = pChunkShifted[iX];
    1472    59943836 :                                         dfTotalLine += val;
    1473             :                                     }
    1474             : 
    1475             :                                     // Right pixel
    1476             :                                     {
    1477     4239330 :                                         const T val =
    1478     4239330 :                                             pChunkShifted[nSrcXOff2 - 1];
    1479     4239330 :                                         dfTotalLine +=
    1480     4239330 :                                             val *
    1481     4239330 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1482             :                                     }
    1483             :                                 }
    1484             :                             }
    1485             : 
    1486     5239852 :                             dfTotal += dfTotalLine * dfWeightY;
    1487     5239852 :                             --nCounterY;
    1488     5239852 :                             if (nCounterY < 0)
    1489     1746435 :                                 break;
    1490     3493427 :                             pChunkShifted += nChunkXSize;
    1491     3493427 :                             dfWeightY = (nCounterY == 0) ? dfTopWeight : 1.0;
    1492             :                         }
    1493             : 
    1494     1746435 :                         dfTotalWeight =
    1495     1746435 :                             pasSrcX[iDstPixel].dfTotalWeightFullLine *
    1496             :                             dfTotalWeightFullColumn;
    1497             :                     }
    1498             :                     else
    1499             :                     {
    1500    15685566 :                         GPtrDiff_t nCount = 0;
    1501    69080198 :                         for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1502             :                         {
    1503    53394432 :                             const auto pChunkShifted =
    1504         132 :                                 pChunk +
    1505    53394432 :                                 static_cast<GPtrDiff_t>(iY) * nChunkXSize;
    1506             : 
    1507    53394432 :                             double dfTotalLine = 0;
    1508    53394432 :                             double dfTotalWeightLine = 0;
    1509             :                             // Left pixel
    1510             :                             {
    1511    53394432 :                                 const int iX = nSrcXOff;
    1512    53394432 :                                 const T val = pChunkShifted[iX];
    1513    53394432 :                                 if (pabyChunkNodataMask[iX + iY * nChunkXSize])
    1514             :                                 {
    1515    23412781 :                                     nCount++;
    1516    23412781 :                                     const double dfWeightX =
    1517    23412781 :                                         pasSrcX[iDstPixel].dfLeftWeight;
    1518    23412781 :                                     dfTotalWeightLine = dfWeightX;
    1519    23412781 :                                     if (bQuadraticMean)
    1520          60 :                                         dfTotalLine =
    1521          60 :                                             SQUARE<double>(val) * dfWeightX;
    1522             :                                     else
    1523    23412781 :                                         dfTotalLine = val * dfWeightX;
    1524             :                                 }
    1525             :                             }
    1526             : 
    1527    53394432 :                             if (nSrcXOff + 1 < nSrcXOff2)
    1528             :                             {
    1529             :                                 // Middle pixels
    1530   141491132 :                                 for (int iX = nSrcXOff + 1; iX + 1 < nSrcXOff2;
    1531             :                                      ++iX)
    1532             :                                 {
    1533    88095700 :                                     const T val = pChunkShifted[iX];
    1534    88095700 :                                     if (pabyChunkNodataMask[iX +
    1535    88095700 :                                                             iY * nChunkXSize])
    1536             :                                     {
    1537    39727500 :                                         nCount++;
    1538    39727500 :                                         dfTotalWeightLine += 1;
    1539    39727500 :                                         if (bQuadraticMean)
    1540           0 :                                             dfTotalLine += SQUARE<double>(val);
    1541             :                                         else
    1542    39727500 :                                             dfTotalLine += val;
    1543             :                                     }
    1544             :                                 }
    1545             : 
    1546             :                                 // Right pixel
    1547             :                                 {
    1548    53395332 :                                     const int iX = nSrcXOff2 - 1;
    1549    53395332 :                                     const T val = pChunkShifted[iX];
    1550    53395332 :                                     if (pabyChunkNodataMask[iX +
    1551    53395332 :                                                             iY * nChunkXSize])
    1552             :                                     {
    1553    23412747 :                                         nCount++;
    1554    23412747 :                                         const double dfWeightX =
    1555    23412747 :                                             pasSrcX[iDstPixel].dfRightWeight;
    1556    23412747 :                                         dfTotalWeightLine += dfWeightX;
    1557    23412747 :                                         if (bQuadraticMean)
    1558           1 :                                             dfTotalLine +=
    1559          61 :                                                 SQUARE<double>(val) * dfWeightX;
    1560             :                                         else
    1561    23412646 :                                             dfTotalLine += val * dfWeightX;
    1562             :                                     }
    1563             :                                 }
    1564             :                             }
    1565             : 
    1566    91105998 :                             const double dfWeightY =
    1567             :                                 (iY == nSrcYOff)        ? dfBottomWeight
    1568    37711466 :                                 : (iY + 1 == nSrcYOff2) ? dfTopWeight
    1569             :                                                         : 1.0;
    1570    53394532 :                             dfTotal += dfTotalLine * dfWeightY;
    1571    53394532 :                             dfTotalWeight += dfTotalWeightLine * dfWeightY;
    1572             :                         }
    1573             : 
    1574    15685766 :                         if (nCount == 0 ||
    1575           8 :                             (bPropagateNoData &&
    1576             :                              nCount <
    1577           8 :                                  static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
    1578           8 :                                      (nSrcXOff2 - nSrcXOff)))
    1579             :                         {
    1580     8937432 :                             pDstScanline[iDstPixel] = tNoDataValue;
    1581     8937432 :                             continue;
    1582             :                         }
    1583             :                     }
    1584             :                     if (eWrkDataType == GDT_Byte)
    1585             :                     {
    1586             :                         T nVal;
    1587     8494610 :                         if (bQuadraticMean)
    1588          38 :                             nVal = ComputeIntegerRMS<T, int>(dfTotal,
    1589             :                                                              dfTotalWeight);
    1590             :                         else
    1591     8494570 :                             nVal =
    1592     8494570 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1593     8495070 :                         if (bHasNoData && nVal == tNoDataValue)
    1594           0 :                             nVal = tReplacementVal;
    1595     8495070 :                         pDstScanline[iDstPixel] = nVal;
    1596             :                     }
    1597             :                     else if (eWrkDataType == GDT_UInt16)
    1598             :                     {
    1599             :                         T nVal;
    1600           8 :                         if (bQuadraticMean)
    1601           4 :                             nVal = ComputeIntegerRMS<T, uint64_t>(
    1602             :                                 dfTotal, dfTotalWeight);
    1603             :                         else
    1604           4 :                             nVal =
    1605           4 :                                 static_cast<T>(dfTotal / dfTotalWeight + 0.5);
    1606           8 :                         if (bHasNoData && nVal == tNoDataValue)
    1607           0 :                             nVal = tReplacementVal;
    1608           8 :                         pDstScanline[iDstPixel] = nVal;
    1609             :                     }
    1610             :                     else
    1611             :                     {
    1612             :                         T nVal;
    1613         151 :                         if (bQuadraticMean)
    1614          20 :                             nVal =
    1615          25 :                                 static_cast<T>(sqrt(dfTotal / dfTotalWeight));
    1616             :                         else
    1617         126 :                             nVal = static_cast<T>(dfTotal / dfTotalWeight);
    1618         151 :                         if (bHasNoData && nVal == tNoDataValue)
    1619           2 :                             nVal = tReplacementVal;
    1620         151 :                         pDstScanline[iDstPixel] = nVal;
    1621             :                     }
    1622             :                 }
    1623             :             }
    1624             :         }
    1625             :         else
    1626             :         {
    1627         115 :             nSrcYOff -= nChunkYOff;
    1628         115 :             nSrcYOff2 -= nChunkYOff;
    1629             : 
    1630        5948 :             for (int iDstPixel = 0; iDstPixel < nDstXWidth; ++iDstPixel)
    1631             :             {
    1632        6475 :                 const int nSrcXOff = pasSrcX[iDstPixel].nLeftXOffShifted;
    1633        6475 :                 const int nSrcXOff2 = pasSrcX[iDstPixel].nRightXOffShifted;
    1634             : 
    1635        6475 :                 GPtrDiff_t nTotalR = 0;
    1636        6475 :                 GPtrDiff_t nTotalG = 0;
    1637        6475 :                 GPtrDiff_t nTotalB = 0;
    1638        6475 :                 GPtrDiff_t nCount = 0;
    1639             : 
    1640       19425 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    1641             :                 {
    1642       38850 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    1643             :                     {
    1644       25900 :                         const T val = pChunk[iX + static_cast<GPtrDiff_t>(iY) *
    1645       25900 :                                                       nChunkXSize];
    1646             :                         // cppcheck-suppress unsignedLessThanZero
    1647       25900 :                         if (val < 0 || val >= colorEntries.size())
    1648           0 :                             continue;
    1649       25900 :                         size_t idx = static_cast<size_t>(val);
    1650       25900 :                         const auto &entry = colorEntries[idx];
    1651       25900 :                         if (entry.c4)
    1652             :                         {
    1653       14128 :                             if (bQuadraticMean)
    1654             :                             {
    1655         800 :                                 nTotalR += SQUARE<int>(entry.c1);
    1656         800 :                                 nTotalG += SQUARE<int>(entry.c2);
    1657         800 :                                 nTotalB += SQUARE<int>(entry.c3);
    1658         800 :                                 ++nCount;
    1659             :                             }
    1660             :                             else
    1661             :                             {
    1662       13328 :                                 nTotalR += entry.c1;
    1663       13328 :                                 nTotalG += entry.c2;
    1664       13328 :                                 nTotalB += entry.c3;
    1665       13328 :                                 ++nCount;
    1666             :                             }
    1667             :                         }
    1668             :                     }
    1669             :                 }
    1670             : 
    1671        6475 :                 if (nCount == 0 ||
    1672           0 :                     (bPropagateNoData &&
    1673           0 :                      nCount < static_cast<GPtrDiff_t>(nSrcYOff2 - nSrcYOff) *
    1674           0 :                                   (nSrcXOff2 - nSrcXOff)))
    1675             :                 {
    1676        2838 :                     pDstScanline[iDstPixel] = tNoDataValue;
    1677             :                 }
    1678             :                 else
    1679             :                 {
    1680             :                     GDALColorEntry color;
    1681        3637 :                     if (bQuadraticMean)
    1682             :                     {
    1683         200 :                         color.c1 =
    1684         200 :                             static_cast<short>(sqrt(nTotalR / nCount) + 0.5);
    1685         200 :                         color.c2 =
    1686         200 :                             static_cast<short>(sqrt(nTotalG / nCount) + 0.5);
    1687         200 :                         color.c3 =
    1688         200 :                             static_cast<short>(sqrt(nTotalB / nCount) + 0.5);
    1689             :                     }
    1690             :                     else
    1691             :                     {
    1692        3437 :                         color.c1 =
    1693        3437 :                             static_cast<short>((nTotalR + nCount / 2) / nCount);
    1694        3437 :                         color.c2 =
    1695        3437 :                             static_cast<short>((nTotalG + nCount / 2) / nCount);
    1696        3437 :                         color.c3 =
    1697        3437 :                             static_cast<short>((nTotalB + nCount / 2) / nCount);
    1698             :                     }
    1699        2995 :                     pDstScanline[iDstPixel] =
    1700        3637 :                         static_cast<T>(BestColorEntry(colorEntries, color));
    1701             :                 }
    1702             :             }
    1703             :         }
    1704             :     }
    1705             : 
    1706        2287 :     CPLFree(pasSrcX);
    1707             : 
    1708        2287 :     return CE_None;
    1709             : }
    1710             : 
    1711        2287 : static CPLErr GDALResampleChunk_AverageOrRMS(
    1712             :     double dfXRatioDstToSrc, double dfYRatioDstToSrc, double dfSrcXDelta,
    1713             :     double dfSrcYDelta, GDALDataType eWrkDataType, const void *pChunk,
    1714             :     const GByte *pabyChunkNodataMask, int nChunkXOff, int nChunkXSize,
    1715             :     int nChunkYOff, int nChunkYSize, int nDstXOff, int nDstXOff2, int nDstYOff,
    1716             :     int nDstYOff2, GDALRasterBand *poOverview, void **ppDstBuffer,
    1717             :     GDALDataType *peDstBufferDataType, const char *pszResampling,
    1718             :     bool bHasNoData, double dfNoDataValue, GDALColorTable *poColorTable,
    1719             :     GDALDataType /* eSrcDataType */, bool bPropagateNoData)
    1720             : {
    1721        2287 :     if (eWrkDataType == GDT_Byte)
    1722             :     {
    1723        2222 :         *peDstBufferDataType = eWrkDataType;
    1724        2222 :         return GDALResampleChunk_AverageOrRMS_T<GByte, int, GDT_Byte>(
    1725             :             dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
    1726             :             static_cast<const GByte *>(pChunk), pabyChunkNodataMask, nChunkXOff,
    1727             :             nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff, nDstXOff2, nDstYOff,
    1728             :             nDstYOff2, poOverview, ppDstBuffer, pszResampling, bHasNoData,
    1729        2222 :             dfNoDataValue, poColorTable, bPropagateNoData);
    1730             :     }
    1731          65 :     else if (eWrkDataType == GDT_UInt16)
    1732             :     {
    1733           9 :         *peDstBufferDataType = eWrkDataType;
    1734           9 :         if (EQUAL(pszResampling, "RMS"))
    1735             :         {
    1736             :             // Use double as accumulation type, because UInt32 could overflow
    1737             :             return GDALResampleChunk_AverageOrRMS_T<GUInt16, double,
    1738           5 :                                                     GDT_UInt16>(
    1739             :                 dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
    1740             :                 static_cast<const GUInt16 *>(pChunk), pabyChunkNodataMask,
    1741             :                 nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
    1742             :                 nDstXOff2, nDstYOff, nDstYOff2, poOverview, ppDstBuffer,
    1743             :                 pszResampling, bHasNoData, dfNoDataValue, poColorTable,
    1744           5 :                 bPropagateNoData);
    1745             :         }
    1746             :         else
    1747             :         {
    1748             :             return GDALResampleChunk_AverageOrRMS_T<GUInt16, GUInt32,
    1749           4 :                                                     GDT_UInt16>(
    1750             :                 dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
    1751             :                 static_cast<const GUInt16 *>(pChunk), pabyChunkNodataMask,
    1752             :                 nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
    1753             :                 nDstXOff2, nDstYOff, nDstYOff2, poOverview, ppDstBuffer,
    1754             :                 pszResampling, bHasNoData, dfNoDataValue, poColorTable,
    1755           4 :                 bPropagateNoData);
    1756             :         }
    1757             :     }
    1758          56 :     else if (eWrkDataType == GDT_Float32)
    1759             :     {
    1760          49 :         *peDstBufferDataType = eWrkDataType;
    1761          49 :         return GDALResampleChunk_AverageOrRMS_T<float, double, GDT_Float32>(
    1762             :             dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
    1763             :             static_cast<const float *>(pChunk), pabyChunkNodataMask, nChunkXOff,
    1764             :             nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff, nDstXOff2, nDstYOff,
    1765             :             nDstYOff2, poOverview, ppDstBuffer, pszResampling, bHasNoData,
    1766          49 :             dfNoDataValue, poColorTable, bPropagateNoData);
    1767             :     }
    1768           7 :     else if (eWrkDataType == GDT_Float64)
    1769             :     {
    1770           7 :         *peDstBufferDataType = eWrkDataType;
    1771           7 :         return GDALResampleChunk_AverageOrRMS_T<double, double, GDT_Float64>(
    1772             :             dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
    1773             :             static_cast<const double *>(pChunk), pabyChunkNodataMask,
    1774             :             nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
    1775             :             nDstXOff2, nDstYOff, nDstYOff2, poOverview, ppDstBuffer,
    1776             :             pszResampling, bHasNoData, dfNoDataValue, poColorTable,
    1777           7 :             bPropagateNoData);
    1778             :     }
    1779             : 
    1780           0 :     CPLAssert(false);
    1781             :     return CE_Failure;
    1782             : }
    1783             : 
    1784             : /************************************************************************/
    1785             : /*                     GDALResampleChunk_Gauss()                        */
    1786             : /************************************************************************/
    1787             : 
    1788          86 : static CPLErr GDALResampleChunk_Gauss(
    1789             :     double dfXRatioDstToSrc, double dfYRatioDstToSrc, double /* dfSrcXDelta */,
    1790             :     double /* dfSrcYDelta */, GDALDataType /* eWrkDataType */,
    1791             :     const void *pChunk, const GByte *pabyChunkNodataMask, int nChunkXOff,
    1792             :     int nChunkXSize, int nChunkYOff, int nChunkYSize, int nDstXOff,
    1793             :     int nDstXOff2, int nDstYOff, int nDstYOff2, GDALRasterBand *poOverview,
    1794             :     void **ppDstBuffer, GDALDataType *peDstBufferDataType,
    1795             :     const char * /* pszResampling */, bool bHasNoData, double dfNoDataValue,
    1796             :     GDALColorTable *poColorTable, GDALDataType /* eSrcDataType */,
    1797             :     bool /* bPropagateNoData */)
    1798             : 
    1799             : {
    1800          86 :     const double *const padfChunk = static_cast<const double *>(pChunk);
    1801             : 
    1802          86 :     *ppDstBuffer =
    1803          86 :         VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
    1804             :                             GDALGetDataTypeSizeBytes(GDT_Float64));
    1805          86 :     if (*ppDstBuffer == nullptr)
    1806             :     {
    1807           0 :         return CE_Failure;
    1808             :     }
    1809          86 :     *peDstBufferDataType = GDT_Float64;
    1810          86 :     double *const padfDstBuffer = static_cast<double *>(*ppDstBuffer);
    1811             : 
    1812             :     /* -------------------------------------------------------------------- */
    1813             :     /*      Create the filter kernel and allocate scanline buffer.          */
    1814             :     /* -------------------------------------------------------------------- */
    1815          86 :     int nGaussMatrixDim = 3;
    1816             :     const int *panGaussMatrix;
    1817          86 :     constexpr int anGaussMatrix3x3[] = {1, 2, 1, 2, 4, 2, 1, 2, 1};
    1818          86 :     constexpr int anGaussMatrix5x5[] = {1,  4, 6,  4,  1,  4, 16, 24, 16,
    1819             :                                         4,  6, 24, 36, 24, 6, 4,  16, 24,
    1820             :                                         16, 4, 1,  4,  6,  4, 1};
    1821          86 :     constexpr int anGaussMatrix7x7[] = {
    1822             :         1,   6,  15, 20,  15,  6,   1,   6,  36, 90,  120, 90,  36,
    1823             :         6,   15, 90, 225, 300, 225, 90,  15, 20, 120, 300, 400, 300,
    1824             :         120, 20, 15, 90,  225, 300, 225, 90, 15, 6,   36,  90,  120,
    1825             :         90,  36, 6,  1,   6,   15,  20,  15, 6,  1};
    1826             : 
    1827          86 :     const int nOXSize = poOverview->GetXSize();
    1828          86 :     const int nOYSize = poOverview->GetYSize();
    1829          86 :     const int nResYFactor = static_cast<int>(0.5 + dfYRatioDstToSrc);
    1830             : 
    1831             :     // matrix for gauss filter
    1832          86 :     if (nResYFactor <= 2)
    1833             :     {
    1834          85 :         panGaussMatrix = anGaussMatrix3x3;
    1835          85 :         nGaussMatrixDim = 3;
    1836             :     }
    1837           1 :     else if (nResYFactor <= 4)
    1838             :     {
    1839           0 :         panGaussMatrix = anGaussMatrix5x5;
    1840           0 :         nGaussMatrixDim = 5;
    1841             :     }
    1842             :     else
    1843             :     {
    1844           1 :         panGaussMatrix = anGaussMatrix7x7;
    1845           1 :         nGaussMatrixDim = 7;
    1846             :     }
    1847             : 
    1848             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    1849             :     int *panGaussMatrixDup = static_cast<int *>(
    1850             :         CPLMalloc(sizeof(int) * nGaussMatrixDim * nGaussMatrixDim));
    1851             :     memcpy(panGaussMatrixDup, panGaussMatrix,
    1852             :            sizeof(int) * nGaussMatrixDim * nGaussMatrixDim);
    1853             :     panGaussMatrix = panGaussMatrixDup;
    1854             : #endif
    1855             : 
    1856          86 :     if (!bHasNoData)
    1857          79 :         dfNoDataValue = 0.0;
    1858             : 
    1859          86 :     std::vector<GDALColorEntry> colorEntries;
    1860          86 :     int nTransparentIdx = -1;
    1861          86 :     if (poColorTable)
    1862           2 :         colorEntries = ReadColorTable(*poColorTable, nTransparentIdx);
    1863             : 
    1864             :     // Force c4 of nodata entry to 0 so that GDALFindBestEntry() identifies
    1865             :     // it as nodata value.
    1866          92 :     if (bHasNoData && dfNoDataValue >= 0.0f &&
    1867           6 :         dfNoDataValue < colorEntries.size())
    1868           0 :         colorEntries[static_cast<int>(dfNoDataValue)].c4 = 0;
    1869             : 
    1870             :     // Or if we have no explicit nodata, but a color table entry that is
    1871             :     // transparent, consider it as the nodata value.
    1872          86 :     else if (!bHasNoData && nTransparentIdx >= 0)
    1873             :     {
    1874           0 :         dfNoDataValue = nTransparentIdx;
    1875             :     }
    1876             : 
    1877          86 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    1878          86 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    1879          86 :     const int nDstXWidth = nDstXOff2 - nDstXOff;
    1880             : 
    1881             :     /* ==================================================================== */
    1882             :     /*      Loop over destination scanlines.                                */
    1883             :     /* ==================================================================== */
    1884       16488 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    1885             :     {
    1886       16402 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    1887       16402 :         int nSrcYOff2 =
    1888       16402 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc) + 1;
    1889             : 
    1890       16402 :         if (nSrcYOff < nChunkYOff)
    1891             :         {
    1892           0 :             nSrcYOff = nChunkYOff;
    1893           0 :             nSrcYOff2++;
    1894             :         }
    1895             : 
    1896       16402 :         const int iSizeY = nSrcYOff2 - nSrcYOff;
    1897       16402 :         nSrcYOff = nSrcYOff + iSizeY / 2 - nGaussMatrixDim / 2;
    1898       16402 :         nSrcYOff2 = nSrcYOff + nGaussMatrixDim;
    1899             : 
    1900       16402 :         if (nSrcYOff2 > nChunkBottomYOff ||
    1901       16359 :             (dfYRatioDstToSrc > 1 && iDstLine == nOYSize - 1))
    1902             :         {
    1903          44 :             nSrcYOff2 = std::min(nChunkBottomYOff, nSrcYOff + nGaussMatrixDim);
    1904             :         }
    1905             : 
    1906       16402 :         int nYShiftGaussMatrix = 0;
    1907       16402 :         if (nSrcYOff < nChunkYOff)
    1908             :         {
    1909           0 :             nYShiftGaussMatrix = -(nSrcYOff - nChunkYOff);
    1910           0 :             nSrcYOff = nChunkYOff;
    1911             :         }
    1912             : 
    1913       16402 :         const double *const padfSrcScanline =
    1914       16402 :             padfChunk + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    1915       16402 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    1916       16402 :         if (pabyChunkNodataMask != nullptr)
    1917         152 :             pabySrcScanlineNodataMask =
    1918         152 :                 pabyChunkNodataMask + ((nSrcYOff - nChunkYOff) * nChunkXSize);
    1919             : 
    1920             :         /* --------------------------------------------------------------------
    1921             :          */
    1922             :         /*      Loop over destination pixels */
    1923             :         /* --------------------------------------------------------------------
    1924             :          */
    1925       16402 :         double *const padfDstScanline =
    1926       16402 :             padfDstBuffer + (iDstLine - nDstYOff) * nDstXWidth;
    1927     4149980 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    1928             :         {
    1929     4133580 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    1930     4133580 :             int nSrcXOff2 =
    1931     4133580 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc) + 1;
    1932             : 
    1933     4133580 :             if (nSrcXOff < nChunkXOff)
    1934             :             {
    1935           0 :                 nSrcXOff = nChunkXOff;
    1936           0 :                 nSrcXOff2++;
    1937             :             }
    1938             : 
    1939     4133580 :             const int iSizeX = nSrcXOff2 - nSrcXOff;
    1940     4133580 :             nSrcXOff = nSrcXOff + iSizeX / 2 - nGaussMatrixDim / 2;
    1941     4133580 :             nSrcXOff2 = nSrcXOff + nGaussMatrixDim;
    1942             : 
    1943     4133580 :             if (nSrcXOff2 > nChunkRightXOff ||
    1944     4127930 :                 (dfXRatioDstToSrc > 1 && iDstPixel == nOXSize - 1))
    1945             :             {
    1946        5650 :                 nSrcXOff2 =
    1947        5650 :                     std::min(nChunkRightXOff, nSrcXOff + nGaussMatrixDim);
    1948             :             }
    1949             : 
    1950     4133580 :             int nXShiftGaussMatrix = 0;
    1951     4133580 :             if (nSrcXOff < nChunkXOff)
    1952             :             {
    1953           0 :                 nXShiftGaussMatrix = -(nSrcXOff - nChunkXOff);
    1954           0 :                 nSrcXOff = nChunkXOff;
    1955             :             }
    1956             : 
    1957     4133580 :             if (poColorTable == nullptr)
    1958             :             {
    1959     4133380 :                 double dfTotal = 0.0;
    1960     4133380 :                 GInt64 nCount = 0;
    1961     4133380 :                 const int *panLineWeight =
    1962     4133380 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    1963             :                     nXShiftGaussMatrix;
    1964             : 
    1965    16527900 :                 for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
    1966    12394500 :                      ++iY, ++j, panLineWeight += nGaussMatrixDim)
    1967             :                 {
    1968    49561300 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    1969             :                     {
    1970    37166800 :                         const double val =
    1971    37166800 :                             padfSrcScanline[iX - nChunkXOff +
    1972    37166800 :                                             static_cast<GPtrDiff_t>(iY -
    1973    37166800 :                                                                     nSrcYOff) *
    1974    37166800 :                                                 nChunkXSize];
    1975    37166800 :                         if (pabySrcScanlineNodataMask == nullptr ||
    1976       32872 :                             pabySrcScanlineNodataMask[iX - nChunkXOff +
    1977       32872 :                                                       static_cast<GPtrDiff_t>(
    1978       32872 :                                                           iY - nSrcYOff) *
    1979       32872 :                                                           nChunkXSize])
    1980             :                         {
    1981    37146100 :                             const int nWeight = panLineWeight[i];
    1982    37146100 :                             dfTotal += val * nWeight;
    1983    37146100 :                             nCount += nWeight;
    1984             :                         }
    1985             :                     }
    1986             :                 }
    1987             : 
    1988     4133380 :                 if (nCount == 0)
    1989             :                 {
    1990        2217 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    1991             :                 }
    1992             :                 else
    1993             :                 {
    1994     4131160 :                     padfDstScanline[iDstPixel - nDstXOff] = dfTotal / nCount;
    1995             :                 }
    1996             :             }
    1997             :             else
    1998             :             {
    1999         200 :                 GInt64 nTotalR = 0;
    2000         200 :                 GInt64 nTotalG = 0;
    2001         200 :                 GInt64 nTotalB = 0;
    2002         200 :                 GInt64 nTotalWeight = 0;
    2003         200 :                 const int *panLineWeight =
    2004         200 :                     panGaussMatrix + nYShiftGaussMatrix * nGaussMatrixDim +
    2005             :                     nXShiftGaussMatrix;
    2006             : 
    2007         780 :                 for (int j = 0, iY = nSrcYOff; iY < nSrcYOff2;
    2008         580 :                      ++iY, ++j, panLineWeight += nGaussMatrixDim)
    2009             :                 {
    2010        2262 :                     for (int i = 0, iX = nSrcXOff; iX < nSrcXOff2; ++iX, ++i)
    2011             :                     {
    2012        1682 :                         const double val =
    2013        1682 :                             padfSrcScanline[iX - nChunkXOff +
    2014        1682 :                                             static_cast<GPtrDiff_t>(iY -
    2015        1682 :                                                                     nSrcYOff) *
    2016        1682 :                                                 nChunkXSize];
    2017        1682 :                         if (val < 0 || val >= colorEntries.size())
    2018           0 :                             continue;
    2019             : 
    2020        1682 :                         size_t idx = static_cast<size_t>(val);
    2021        1682 :                         if (colorEntries[idx].c4)
    2022             :                         {
    2023        1682 :                             const int nWeight = panLineWeight[i];
    2024        1682 :                             nTotalR +=
    2025        1682 :                                 static_cast<GInt64>(colorEntries[idx].c1) *
    2026        1682 :                                 nWeight;
    2027        1682 :                             nTotalG +=
    2028        1682 :                                 static_cast<GInt64>(colorEntries[idx].c2) *
    2029        1682 :                                 nWeight;
    2030        1682 :                             nTotalB +=
    2031        1682 :                                 static_cast<GInt64>(colorEntries[idx].c3) *
    2032        1682 :                                 nWeight;
    2033        1682 :                             nTotalWeight += nWeight;
    2034             :                         }
    2035             :                     }
    2036             :                 }
    2037             : 
    2038         200 :                 if (nTotalWeight == 0)
    2039             :                 {
    2040           0 :                     padfDstScanline[iDstPixel - nDstXOff] = dfNoDataValue;
    2041             :                 }
    2042             :                 else
    2043             :                 {
    2044             :                     GDALColorEntry color;
    2045             : 
    2046         200 :                     color.c1 = static_cast<short>((nTotalR + nTotalWeight / 2) /
    2047             :                                                   nTotalWeight);
    2048         200 :                     color.c2 = static_cast<short>((nTotalG + nTotalWeight / 2) /
    2049             :                                                   nTotalWeight);
    2050         200 :                     color.c3 = static_cast<short>((nTotalB + nTotalWeight / 2) /
    2051             :                                                   nTotalWeight);
    2052         200 :                     padfDstScanline[iDstPixel - nDstXOff] =
    2053         200 :                         BestColorEntry(colorEntries, color);
    2054             :                 }
    2055             :             }
    2056             :         }
    2057             :     }
    2058             : 
    2059             : #ifdef DEBUG_OUT_OF_BOUND_ACCESS
    2060             :     CPLFree(panGaussMatrixDup);
    2061             : #endif
    2062             : 
    2063          86 :     return CE_None;
    2064             : }
    2065             : 
    2066             : /************************************************************************/
    2067             : /*                      GDALResampleChunk_Mode()                        */
    2068             : /************************************************************************/
    2069             : 
    2070             : template <class T>
    2071          80 : static CPLErr GDALResampleChunk_Mode_T(
    2072             :     double dfXRatioDstToSrc, double dfYRatioDstToSrc, double dfSrcXDelta,
    2073             :     double dfSrcYDelta, const T *pChunk, const GByte *pabyChunkNodataMask,
    2074             :     int nChunkXOff, int nChunkXSize, int nChunkYOff, int nChunkYSize,
    2075             :     int nDstXOff, int nDstXOff2, int nDstYOff, int nDstYOff2,
    2076             :     T *const pDstBuffer, bool bHasNoData, double dfNoDataValue,
    2077             :     GDALColorTable *poColorTable, GDALDataType eSrcDataType)
    2078             : 
    2079             : {
    2080          80 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    2081             : 
    2082             :     T tNoDataValue;
    2083          80 :     if (!bHasNoData || !GDALIsValueInRange<T>(dfNoDataValue))
    2084          79 :         tNoDataValue = 0;
    2085             :     else
    2086           1 :         tNoDataValue = static_cast<T>(dfNoDataValue);
    2087             : 
    2088          80 :     size_t nMaxNumPx = 0;
    2089          80 :     T *padfVals = nullptr;
    2090          80 :     int *panSums = nullptr;
    2091             : 
    2092          80 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    2093          80 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    2094         160 :     std::vector<int> anVals(256, 0);
    2095             : 
    2096             :     /* ==================================================================== */
    2097             :     /*      Loop over destination scanlines.                                */
    2098             :     /* ==================================================================== */
    2099        7419 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    2100             :     {
    2101        7339 :         double dfSrcYOff = dfSrcYDelta + iDstLine * dfYRatioDstToSrc;
    2102        7339 :         int nSrcYOff = static_cast<int>(dfSrcYOff + 1e-8);
    2103             : #ifdef only_pixels_with_more_than_10_pct_participation
    2104             :         // When oversampling, don't take into account pixels that have a tiny
    2105             :         // participation in the resulting pixel
    2106             :         if (dfYRatioDstToSrc > 1 && dfSrcYOff - nSrcYOff > 0.9 &&
    2107             :             nSrcYOff < nChunkBottomYOff)
    2108             :             nSrcYOff++;
    2109             : #endif
    2110        7339 :         if (nSrcYOff < nChunkYOff)
    2111           0 :             nSrcYOff = nChunkYOff;
    2112             : 
    2113        7339 :         double dfSrcYOff2 = dfSrcYDelta + (iDstLine + 1) * dfYRatioDstToSrc;
    2114        7339 :         int nSrcYOff2 = static_cast<int>(ceil(dfSrcYOff2 - 1e-8));
    2115             : #ifdef only_pixels_with_more_than_10_pct_participation
    2116             :         // When oversampling, don't take into account pixels that have a tiny
    2117             :         // participation in the resulting pixel
    2118             :         if (dfYRatioDstToSrc > 1 && nSrcYOff2 - dfSrcYOff2 > 0.9 &&
    2119             :             nSrcYOff2 > nChunkYOff)
    2120             :             nSrcYOff2--;
    2121             : #endif
    2122        7339 :         if (nSrcYOff2 == nSrcYOff)
    2123           0 :             ++nSrcYOff2;
    2124        7339 :         if (nSrcYOff2 > nChunkBottomYOff)
    2125           0 :             nSrcYOff2 = nChunkBottomYOff;
    2126             : 
    2127        7339 :         const T *const paSrcScanline =
    2128         101 :             pChunk +
    2129        7339 :             (static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize);
    2130        7339 :         const GByte *pabySrcScanlineNodataMask = nullptr;
    2131        7339 :         if (pabyChunkNodataMask != nullptr)
    2132        1810 :             pabySrcScanlineNodataMask =
    2133             :                 pabyChunkNodataMask +
    2134        1810 :                 static_cast<GPtrDiff_t>(nSrcYOff - nChunkYOff) * nChunkXSize;
    2135             : 
    2136        7339 :         T *const paDstScanline = pDstBuffer + (iDstLine - nDstYOff) * nDstXSize;
    2137             :         /* --------------------------------------------------------------------
    2138             :          */
    2139             :         /*      Loop over destination pixels */
    2140             :         /* --------------------------------------------------------------------
    2141             :          */
    2142     4259466 :         for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    2143             :         {
    2144     4252125 :             double dfSrcXOff = dfSrcXDelta + iDstPixel * dfXRatioDstToSrc;
    2145             :             // Apply some epsilon to avoid numerical precision issues
    2146     4252125 :             int nSrcXOff = static_cast<int>(dfSrcXOff + 1e-8);
    2147             : #ifdef only_pixels_with_more_than_10_pct_participation
    2148             :             // When oversampling, don't take into account pixels that have a
    2149             :             // tiny participation in the resulting pixel
    2150             :             if (dfXRatioDstToSrc > 1 && dfSrcXOff - nSrcXOff > 0.9 &&
    2151             :                 nSrcXOff < nChunkRightXOff)
    2152             :                 nSrcXOff++;
    2153             : #endif
    2154     4252125 :             if (nSrcXOff < nChunkXOff)
    2155           0 :                 nSrcXOff = nChunkXOff;
    2156             : 
    2157     4252125 :             double dfSrcXOff2 =
    2158     4252125 :                 dfSrcXDelta + (iDstPixel + 1) * dfXRatioDstToSrc;
    2159     4252125 :             int nSrcXOff2 = static_cast<int>(ceil(dfSrcXOff2 - 1e-8));
    2160             : #ifdef only_pixels_with_more_than_10_pct_participation
    2161             :             // When oversampling, don't take into account pixels that have a
    2162             :             // tiny participation in the resulting pixel
    2163             :             if (dfXRatioDstToSrc > 1 && nSrcXOff2 - dfSrcXOff2 > 0.9 &&
    2164             :                 nSrcXOff2 > nChunkXOff)
    2165             :                 nSrcXOff2--;
    2166             : #endif
    2167     4252125 :             if (nSrcXOff2 == nSrcXOff)
    2168           0 :                 nSrcXOff2++;
    2169     4252125 :             if (nSrcXOff2 > nChunkRightXOff)
    2170           0 :                 nSrcXOff2 = nChunkRightXOff;
    2171             : 
    2172     4252325 :             if (eSrcDataType != GDT_Byte ||
    2173         200 :                 (poColorTable && poColorTable->GetColorEntryCount() > 256))
    2174             :             {
    2175             :                 // Not sure how much sense it makes to run a majority
    2176             :                 // filter on floating point data, but here it is for the sake
    2177             :                 // of compatibility. It won't look right on RGB images by the
    2178             :                 // nature of the filter.
    2179             : 
    2180         775 :                 if (nSrcYOff2 - nSrcYOff <= 0 || nSrcXOff2 - nSrcXOff <= 0 ||
    2181        2325 :                     nSrcYOff2 - nSrcYOff > INT_MAX / (nSrcXOff2 - nSrcXOff) ||
    2182         775 :                     static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    2183         775 :                             static_cast<size_t>(nSrcXOff2 - nSrcXOff) >
    2184         775 :                         std::numeric_limits<size_t>::max() / sizeof(float))
    2185             :                 {
    2186           0 :                     CPLError(CE_Failure, CPLE_NotSupported,
    2187             :                              "Too big downsampling factor");
    2188           0 :                     CPLFree(padfVals);
    2189           0 :                     CPLFree(panSums);
    2190           0 :                     return CE_Failure;
    2191             :                 }
    2192         775 :                 const size_t nNumPx =
    2193         775 :                     static_cast<size_t>(nSrcYOff2 - nSrcYOff) *
    2194         775 :                     static_cast<size_t>(nSrcXOff2 - nSrcXOff);
    2195         775 :                 size_t iMaxInd = 0;
    2196         775 :                 size_t iMaxVal = 0;
    2197         775 :                 bool biMaxValdValid = false;
    2198             : 
    2199         775 :                 if (padfVals == nullptr || nNumPx > nMaxNumPx)
    2200             :                 {
    2201             :                     T *padfValsNew = static_cast<T *>(
    2202          19 :                         VSI_REALLOC_VERBOSE(padfVals, nNumPx * sizeof(T)));
    2203             :                     int *panSumsNew = static_cast<int *>(
    2204          19 :                         VSI_REALLOC_VERBOSE(panSums, nNumPx * sizeof(int)));
    2205          19 :                     if (padfValsNew != nullptr)
    2206          19 :                         padfVals = padfValsNew;
    2207          19 :                     if (panSumsNew != nullptr)
    2208          19 :                         panSums = panSumsNew;
    2209          19 :                     if (padfValsNew == nullptr || panSumsNew == nullptr)
    2210             :                     {
    2211           0 :                         CPLFree(padfVals);
    2212           0 :                         CPLFree(panSums);
    2213           0 :                         return CE_Failure;
    2214             :                     }
    2215          19 :                     nMaxNumPx = nNumPx;
    2216             :                 }
    2217             : 
    2218        2325 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2219             :                 {
    2220        1550 :                     const GPtrDiff_t iTotYOff =
    2221        1550 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2222        1550 :                         nChunkXOff;
    2223        4650 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2224             :                     {
    2225        3100 :                         if (pabySrcScanlineNodataMask == nullptr ||
    2226          16 :                             pabySrcScanlineNodataMask[iX + iTotYOff])
    2227             :                         {
    2228        3085 :                             const T dfVal = paSrcScanline[iX + iTotYOff];
    2229        3085 :                             size_t i = 0;  // Used after for.
    2230             : 
    2231             :                             // Check array for existing entry.
    2232        7315 :                             for (; i < iMaxInd; ++i)
    2233        5146 :                                 if (padfVals[i] == dfVal &&
    2234         670 :                                     ++panSums[i] > panSums[iMaxVal])
    2235             :                                 {
    2236         246 :                                     iMaxVal = i;
    2237         246 :                                     biMaxValdValid = true;
    2238         246 :                                     break;
    2239             :                                 }
    2240             : 
    2241             :                             // Add to arr if entry not already there.
    2242        3085 :                             if (i == iMaxInd)
    2243             :                             {
    2244        2839 :                                 padfVals[iMaxInd] = dfVal;
    2245        2839 :                                 panSums[iMaxInd] = 1;
    2246             : 
    2247        2839 :                                 if (!biMaxValdValid)
    2248             :                                 {
    2249         772 :                                     iMaxVal = iMaxInd;
    2250         772 :                                     biMaxValdValid = true;
    2251             :                                 }
    2252             : 
    2253        2839 :                                 ++iMaxInd;
    2254             :                             }
    2255             :                         }
    2256             :                     }
    2257             :                 }
    2258             : 
    2259         775 :                 if (!biMaxValdValid)
    2260           3 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2261             :                 else
    2262         772 :                     paDstScanline[iDstPixel - nDstXOff] = padfVals[iMaxVal];
    2263             :             }
    2264             :             else  // if( eSrcDataType == GDT_Byte && nEntryCount < 256 )
    2265             :             {
    2266             :                 // So we go here for a paletted or non-paletted byte band.
    2267             :                 // The input values are then between 0 and 255.
    2268     4251350 :                 int nMaxVal = 0;
    2269     4251350 :                 int iMaxInd = -1;
    2270             : 
    2271             :                 // The cost of this zeroing might be high. Perhaps we should
    2272             :                 // just use the above generic case, and go to this one if the
    2273             :                 // number of source pixels is large enough
    2274     4251350 :                 std::fill(anVals.begin(), anVals.end(), 0);
    2275             : 
    2276    12777700 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    2277             :                 {
    2278     8526360 :                     const GPtrDiff_t iTotYOff =
    2279     8526360 :                         static_cast<GPtrDiff_t>(iY - nSrcYOff) * nChunkXSize -
    2280     8526360 :                         nChunkXOff;
    2281    25649300 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    2282             :                     {
    2283    17122900 :                         const T val = paSrcScanline[iX + iTotYOff];
    2284    17122900 :                         if (!bHasNoData || val != tNoDataValue)
    2285             :                         {
    2286    17122900 :                             int nVal = static_cast<int>(val);
    2287    17122900 :                             if (++anVals[nVal] > nMaxVal)
    2288             :                             {
    2289             :                                 // Sum the density.
    2290             :                                 // Is it the most common value so far?
    2291    17006200 :                                 iMaxInd = nVal;
    2292    17006200 :                                 nMaxVal = anVals[nVal];
    2293             :                             }
    2294             :                         }
    2295             :                     }
    2296             :                 }
    2297             : 
    2298     4251350 :                 if (iMaxInd == -1)
    2299           0 :                     paDstScanline[iDstPixel - nDstXOff] = tNoDataValue;
    2300             :                 else
    2301     4251350 :                     paDstScanline[iDstPixel - nDstXOff] =
    2302             :                         static_cast<T>(iMaxInd);
    2303             :             }
    2304             :         }
    2305             :     }
    2306             : 
    2307          80 :     CPLFree(padfVals);
    2308          80 :     CPLFree(panSums);
    2309             : 
    2310          80 :     return CE_None;
    2311             : }
    2312             : 
    2313          80 : static CPLErr GDALResampleChunk_Mode(
    2314             :     double dfXRatioDstToSrc, double dfYRatioDstToSrc, double dfSrcXDelta,
    2315             :     double dfSrcYDelta, GDALDataType eWrkDataType, const void *pChunk,
    2316             :     const GByte *pabyChunkNodataMask, int nChunkXOff, int nChunkXSize,
    2317             :     int nChunkYOff, int nChunkYSize, int nDstXOff, int nDstXOff2, int nDstYOff,
    2318             :     int nDstYOff2, GDALRasterBand * /*poOverview*/, void **ppDstBuffer,
    2319             :     GDALDataType *peDstBufferDataType, const char * /* pszResampling */,
    2320             :     bool bHasNoData, double dfNoDataValue, GDALColorTable *poColorTable,
    2321             :     GDALDataType eSrcDataType, bool /*bPropagateNoData*/)
    2322             : {
    2323          80 :     *ppDstBuffer =
    2324          80 :         VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
    2325             :                             GDALGetDataTypeSizeBytes(eWrkDataType));
    2326          80 :     if (*ppDstBuffer == nullptr)
    2327             :     {
    2328           0 :         return CE_Failure;
    2329             :     }
    2330             : 
    2331          80 :     *peDstBufferDataType = eWrkDataType;
    2332          80 :     if (eWrkDataType == GDT_Byte)
    2333             :     {
    2334          61 :         return GDALResampleChunk_Mode_T<GByte>(
    2335             :             dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
    2336             :             static_cast<const GByte *>(pChunk), pabyChunkNodataMask, nChunkXOff,
    2337             :             nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff, nDstXOff2, nDstYOff,
    2338             :             nDstYOff2, static_cast<GByte *>(*ppDstBuffer), bHasNoData,
    2339          61 :             dfNoDataValue, poColorTable, eSrcDataType);
    2340             :     }
    2341          19 :     else if (eWrkDataType == GDT_UInt16)
    2342             :     {
    2343           1 :         return GDALResampleChunk_Mode_T<GUInt16>(
    2344             :             dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
    2345             :             static_cast<const GUInt16 *>(pChunk), pabyChunkNodataMask,
    2346             :             nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
    2347             :             nDstXOff2, nDstYOff, nDstYOff2,
    2348             :             static_cast<GUInt16 *>(*ppDstBuffer), bHasNoData, dfNoDataValue,
    2349           1 :             poColorTable, eSrcDataType);
    2350             :     }
    2351          18 :     else if (eWrkDataType == GDT_Float32)
    2352             :     {
    2353          16 :         return GDALResampleChunk_Mode_T<float>(
    2354             :             dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
    2355             :             static_cast<const float *>(pChunk), pabyChunkNodataMask, nChunkXOff,
    2356             :             nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff, nDstXOff2, nDstYOff,
    2357             :             nDstYOff2, static_cast<float *>(*ppDstBuffer), bHasNoData,
    2358          16 :             dfNoDataValue, poColorTable, eSrcDataType);
    2359             :     }
    2360           2 :     else if (eWrkDataType == GDT_Float64)
    2361             :     {
    2362           2 :         return GDALResampleChunk_Mode_T<double>(
    2363             :             dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
    2364             :             static_cast<const double *>(pChunk), pabyChunkNodataMask,
    2365             :             nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
    2366             :             nDstXOff2, nDstYOff, nDstYOff2, static_cast<double *>(*ppDstBuffer),
    2367           2 :             bHasNoData, dfNoDataValue, poColorTable, eSrcDataType);
    2368             :     }
    2369             : 
    2370           0 :     CPLAssert(false);
    2371             :     return CE_Failure;
    2372             : }
    2373             : 
    2374             : /************************************************************************/
    2375             : /*                  GDALResampleConvolutionHorizontal()                 */
    2376             : /************************************************************************/
    2377             : 
    /**
     * Weighted sum of nSrcPixelCount consecutive samples:
     * sum(pChunk[k] * padfWeights[k]) for k in [0, nSrcPixelCount).
     *
     * The bulk of the work is done four samples at a time, spread over two
     * accumulators that are added together at the end; remaining samples go
     * to the first accumulator.
     */
    template <class T>
    static inline double
    GDALResampleConvolutionHorizontal(const T *pChunk, const double *padfWeights,
                                      int nSrcPixelCount)
    {
        double dfAccLow = 0.0;   // samples 0 and 1 of each group, plus the tail
        double dfAccHigh = 0.0;  // samples 2 and 3 of each group
        int iPixel = 0;
        // Intel Compiler 2024.0.2.29 (maybe other versions?) crashes on this
        // manually (untypical) unrolled loop in -O2 and -O3:
        // https://github.com/OSGeo/gdal/issues/9508
    #if !defined(__INTEL_CLANG_COMPILER)
        while (iPixel + 3 < nSrcPixelCount)
        {
            const T *const pSrc = pChunk + iPixel;
            const double *const pdfW = padfWeights + iPixel;
            dfAccLow += pSrc[0] * pdfW[0];
            dfAccLow += pSrc[1] * pdfW[1];
            dfAccHigh += pSrc[2] * pdfW[2];
            dfAccHigh += pSrc[3] * pdfW[3];
            iPixel += 4;
        }
    #endif
        // Samples left over after the groups of four.
        while (iPixel < nSrcPixelCount)
        {
            dfAccLow += pChunk[iPixel] * padfWeights[iPixel];
            ++iPixel;
        }
        return dfAccLow + dfAccHigh;
    }
    2404             : 
    2405             : template <class T>
    2406          13 : static inline void GDALResampleConvolutionHorizontalWithMask(
    2407             :     const T *pChunk, const GByte *pabyMask, const double *padfWeights,
    2408             :     int nSrcPixelCount, double &dfVal, double &dfWeightSum)
    2409             : {
    2410          13 :     dfVal = 0;
    2411          13 :     dfWeightSum = 0;
    2412          13 :     int i = 0;
    2413          13 :     for (; i + 3 < nSrcPixelCount; i += 4)
    2414             :     {
    2415           0 :         const double dfWeight0 = padfWeights[i] * pabyMask[i];
    2416           0 :         const double dfWeight1 = padfWeights[i + 1] * pabyMask[i + 1];
    2417           0 :         const double dfWeight2 = padfWeights[i + 2] * pabyMask[i + 2];
    2418           0 :         const double dfWeight3 = padfWeights[i + 3] * pabyMask[i + 3];
    2419           0 :         dfVal += pChunk[i] * dfWeight0;
    2420           0 :         dfVal += pChunk[i + 1] * dfWeight1;
    2421           0 :         dfVal += pChunk[i + 2] * dfWeight2;
    2422           0 :         dfVal += pChunk[i + 3] * dfWeight3;
    2423           0 :         dfWeightSum += dfWeight0 + dfWeight1 + dfWeight2 + dfWeight3;
    2424             :     }
    2425          45 :     for (; i < nSrcPixelCount; ++i)
    2426             :     {
    2427          32 :         const double dfWeight = padfWeights[i] * pabyMask[i];
    2428          32 :         dfVal += pChunk[i] * dfWeight;
    2429          32 :         dfWeightSum += dfWeight;
    2430             :     }
    2431          13 : }
    2432             : 
    /**
     * Weighted sum of nSrcPixelCount consecutive samples, computed for three
     * rows at once with the same weights.
     *
     * Each row uses two accumulators (samples 0-1 and samples 2-3 of every
     * group of four) that are added together at the end; leftover samples go
     * to the first accumulator of their row.
     *
     * @param[out] dfRes1  Result for pChunkRow1.
     * @param[out] dfRes2  Result for pChunkRow2.
     * @param[out] dfRes3  Result for pChunkRow3.
     */
    template <class T>
    static inline void GDALResampleConvolutionHorizontal_3rows(
        const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
        const double *padfWeights, int nSrcPixelCount, double &dfRes1,
        double &dfRes2, double &dfRes3)
    {
        double dfRow1Low = 0.0, dfRow1High = 0.0;
        double dfRow2Low = 0.0, dfRow2High = 0.0;
        double dfRow3Low = 0.0, dfRow3High = 0.0;

        int iPixel = 0;
        // Groups of four samples.
        while (iPixel + 3 < nSrcPixelCount)
        {
            const double dfW0 = padfWeights[iPixel];
            const double dfW1 = padfWeights[iPixel + 1];
            const double dfW2 = padfWeights[iPixel + 2];
            const double dfW3 = padfWeights[iPixel + 3];

            dfRow1Low += pChunkRow1[iPixel] * dfW0;
            dfRow1Low += pChunkRow1[iPixel + 1] * dfW1;
            dfRow1High += pChunkRow1[iPixel + 2] * dfW2;
            dfRow1High += pChunkRow1[iPixel + 3] * dfW3;

            dfRow2Low += pChunkRow2[iPixel] * dfW0;
            dfRow2Low += pChunkRow2[iPixel + 1] * dfW1;
            dfRow2High += pChunkRow2[iPixel + 2] * dfW2;
            dfRow2High += pChunkRow2[iPixel + 3] * dfW3;

            dfRow3Low += pChunkRow3[iPixel] * dfW0;
            dfRow3Low += pChunkRow3[iPixel + 1] * dfW1;
            dfRow3High += pChunkRow3[iPixel + 2] * dfW2;
            dfRow3High += pChunkRow3[iPixel + 3] * dfW3;

            iPixel += 4;
        }
        // Samples left over after the groups of four.
        while (iPixel < nSrcPixelCount)
        {
            const double dfW = padfWeights[iPixel];
            dfRow1Low += pChunkRow1[iPixel] * dfW;
            dfRow2Low += pChunkRow2[iPixel] * dfW;
            dfRow3Low += pChunkRow3[iPixel] * dfW;
            ++iPixel;
        }

        dfRes1 = dfRow1Low + dfRow1High;
        dfRes2 = dfRow2Low + dfRow2High;
        dfRes3 = dfRow3Low + dfRow3High;
    }
    2471             : 
    /**
     * Three-row weighted sum for a caller-supplied pixel count.
     *
     * This implementation simply forwards every argument to
     * GDALResampleConvolutionHorizontal_3rows().
     */
    template <class T>
    static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
        const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
        const double *padfWeights, int nSrcPixelCount, double &dfRes1,
        double &dfRes2, double &dfRes3)
    {
        GDALResampleConvolutionHorizontal_3rows(
            pChunkRow1, pChunkRow2, pChunkRow3,  // the three source rows
            padfWeights, nSrcPixelCount,         // shared weights and count
            dfRes1, dfRes2, dfRes3);             // one result per row
    }
    2482             : 
    /**
     * Three-row weighted sum over exactly four samples per row.
     *
     * This implementation forwards to
     * GDALResampleConvolutionHorizontal_3rows() with a pixel count of 4.
     */
    template <class T>
    static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows(
        const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
        const double *padfWeights, double &dfRes1, double &dfRes2, double &dfRes3)
    {
        const int nFixedPixelCount = 4;
        GDALResampleConvolutionHorizontal_3rows(
            pChunkRow1, pChunkRow2, pChunkRow3, padfWeights, nFixedPixelCount,
            dfRes1, dfRes2, dfRes3);
    }
    2492             : 
    2493             : /************************************************************************/
    2494             : /*                  GDALResampleConvolutionVertical()                   */
    2495             : /************************************************************************/
    2496             : 
    // Weighted sum down one column of a strided buffer:
    //     sum over k in [0, nSrcLineCount) of pChunk[k * nStride] * padfWeights[k]
    //
    // Lines are consumed four at a time. Within each group of four, the first
    // two products go to one partial sum and the last two to a second partial
    // sum; leftover lines (fewer than four) go to the first partial sum. The
    // two partial sums are added only at the very end, so the floating-point
    // summation order is fixed by this layout.
    template <class T>
    static inline double
    GDALResampleConvolutionVertical(const T *pChunk, int nStride,
                                    const double *padfWeights, int nSrcLineCount)
    {
        double dfSumFirstPair = 0.0;   // lines 0,1 of each group + remainder
        double dfSumSecondPair = 0.0;  // lines 2,3 of each group
        int iLine = 0;
        int nOffset = 0;  // element offset of line iLine inside pChunk

        while (iLine + 3 < nSrcLineCount)
        {
            const T *const pGroup = pChunk + nOffset;
            const double *const pW = padfWeights + iLine;

            dfSumFirstPair += pGroup[0] * pW[0];
            dfSumFirstPair += pGroup[nStride] * pW[1];
            dfSumSecondPair += pGroup[2 * nStride] * pW[2];
            dfSumSecondPair += pGroup[3 * nStride] * pW[3];

            iLine += 4;
            nOffset += 4 * nStride;
        }

        // Remaining 0 to 3 lines.
        while (iLine < nSrcLineCount)
        {
            dfSumFirstPair += pChunk[nOffset] * padfWeights[iLine];
            ++iLine;
            nOffset += nStride;
        }

        return dfSumFirstPair + dfSumSecondPair;
    }
    2519             : 
    // Same vertical weighted sum as GDALResampleConvolutionVertical(), but for
    // two adjacent columns at once:
    //     dfRes1 = sum_k pChunk[k * nStride]     * padfWeights[k]
    //     dfRes2 = sum_k pChunk[k * nStride + 1] * padfWeights[k]
    //
    // Each column keeps two partial sums (lines 0,1 and lines 2,3 of every
    // group of four lines; leftover lines go to the first one) that are only
    // combined when the results are written out.
    template <class T>
    static inline void GDALResampleConvolutionVertical_2cols(
        const T *pChunk, int nStride, const double *padfWeights, int nSrcLineCount,
        double &dfRes1, double &dfRes2)
    {
        double dfCol0First = 0.0;
        double dfCol0Second = 0.0;
        double dfCol1First = 0.0;
        double dfCol1Second = 0.0;
        int iLine = 0;
        int nOffset = 0;  // element offset of line iLine inside pChunk

        while (iLine + 3 < nSrcLineCount)
        {
            const T *const pLine0 = pChunk + nOffset;
            const T *const pLine1 = pLine0 + nStride;
            const T *const pLine2 = pLine1 + nStride;
            const T *const pLine3 = pLine2 + nStride;
            const double dfW0 = padfWeights[iLine];
            const double dfW1 = padfWeights[iLine + 1];
            const double dfW2 = padfWeights[iLine + 2];
            const double dfW3 = padfWeights[iLine + 3];

            // First column.
            dfCol0First += pLine0[0] * dfW0;
            dfCol0First += pLine1[0] * dfW1;
            dfCol0Second += pLine2[0] * dfW2;
            dfCol0Second += pLine3[0] * dfW3;

            // Second column.
            dfCol1First += pLine0[1] * dfW0;
            dfCol1First += pLine1[1] * dfW1;
            dfCol1Second += pLine2[1] * dfW2;
            dfCol1Second += pLine3[1] * dfW3;

            iLine += 4;
            nOffset += 4 * nStride;
        }

        // Remaining 0 to 3 lines.
        while (iLine < nSrcLineCount)
        {
            const double dfW = padfWeights[iLine];
            dfCol0First += pChunk[nOffset] * dfW;
            dfCol1First += pChunk[nOffset + 1] * dfW;
            ++iLine;
            nOffset += nStride;
        }

        dfRes1 = dfCol0First + dfCol0Second;
        dfRes2 = dfCol1First + dfCol1Second;
    }
    2550             : 
    2551             : #ifdef USE_SSE2
    2552             : 
    2553             : #ifdef __AVX__
    2554             : /************************************************************************/
    2555             : /*             GDALResampleConvolutionVertical_16cols<T>                */
    2556             : /************************************************************************/
    2557             : 
    2558             : template <class T>
    2559             : static inline void
    2560             : GDALResampleConvolutionVertical_16cols(const T *pChunk, int nStride,
    2561             :                                        const double *padfWeights,
    2562             :                                        int nSrcLineCount, float *afDest)
    2563             : {
    2564             :     int i = 0;
    2565             :     int j = 0;
    2566             :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
    2567             :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2568             :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    2569             :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
    2570             :     for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
    2571             :     {
    2572             :         XMMReg4Double w0 =
    2573             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
    2574             :         XMMReg4Double w1 =
    2575             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    2576             :         XMMReg4Double w2 =
    2577             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    2578             :         XMMReg4Double w3 =
    2579             :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    2580             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
    2581             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    2582             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 0 * nStride) * w0;
    2583             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 0 * nStride) * w0;
    2584             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
    2585             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    2586             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 1 * nStride) * w1;
    2587             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 1 * nStride) * w1;
    2588             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
    2589             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    2590             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 2 * nStride) * w2;
    2591             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 2 * nStride) * w2;
    2592             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
    2593             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    2594             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8 + 3 * nStride) * w3;
    2595             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12 + 3 * nStride) * w3;
    2596             :     }
    2597             :     for (; i < nSrcLineCount; ++i, j += nStride)
    2598             :     {
    2599             :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    2600             :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    2601             :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    2602             :         v_acc2 += XMMReg4Double::Load4Val(pChunk + j + 8) * w;
    2603             :         v_acc3 += XMMReg4Double::Load4Val(pChunk + j + 12) * w;
    2604             :     }
    2605             :     v_acc0.Store4Val(afDest);
    2606             :     v_acc1.Store4Val(afDest + 4);
    2607             :     v_acc2.Store4Val(afDest + 8);
    2608             :     v_acc3.Store4Val(afDest + 12);
    2609             : }
    2610             : 
    2611             : template <class T>
    2612             : static inline void GDALResampleConvolutionVertical_16cols(const T *, int,
    2613             :                                                           const double *, int,
    2614             :                                                           double *)
    2615             : {
    2616             :     // Cannot be reached
    2617             :     CPLAssert(false);
    2618             : }
    2619             : 
    2620             : #else
    2621             : 
    2622             : /************************************************************************/
    2623             : /*              GDALResampleConvolutionVertical_8cols<T>                */
    2624             : /************************************************************************/
    2625             : 
    2626             : template <class T>
    2627             : static inline void
    2628    18601600 : GDALResampleConvolutionVertical_8cols(const T *pChunk, int nStride,
    2629             :                                       const double *padfWeights,
    2630             :                                       int nSrcLineCount, float *afDest)
    2631             : {
    2632    18601600 :     int i = 0;
    2633    18601600 :     int j = 0;
    2634    18601600 :     XMMReg4Double v_acc0 = XMMReg4Double::Zero();
    2635    18467700 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2636    33692600 :     for (; i + 3 < nSrcLineCount; i += 4, j += 4 * nStride)
    2637             :     {
    2638    15168300 :         XMMReg4Double w0 =
    2639    15168300 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 0);
    2640    15114600 :         XMMReg4Double w1 =
    2641    15114600 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 1);
    2642    15106700 :         XMMReg4Double w2 =
    2643    15106700 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 2);
    2644    15116900 :         XMMReg4Double w3 =
    2645    15116900 :             XMMReg4Double::Load1ValHighAndLow(padfWeights + i + 3);
    2646    15142900 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 0 * nStride) * w0;
    2647    15084600 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 0 * nStride) * w0;
    2648    15084400 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 1 * nStride) * w1;
    2649    15115300 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 1 * nStride) * w1;
    2650    15113000 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 2 * nStride) * w2;
    2651    15113800 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 2 * nStride) * w2;
    2652    15123000 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0 + 3 * nStride) * w3;
    2653    15116500 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4 + 3 * nStride) * w3;
    2654             :     }
    2655    29914900 :     for (; i < nSrcLineCount; ++i, j += nStride)
    2656             :     {
    2657    11390600 :         XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(padfWeights + i);
    2658    11390600 :         v_acc0 += XMMReg4Double::Load4Val(pChunk + j + 0) * w;
    2659    11390600 :         v_acc1 += XMMReg4Double::Load4Val(pChunk + j + 4) * w;
    2660             :     }
    2661    18524300 :     v_acc0.Store4Val(afDest);
    2662    18545100 :     v_acc1.Store4Val(afDest + 4);
    2663    18579000 : }
    2664             : 
    2665             : template <class T>
    2666             : static inline void GDALResampleConvolutionVertical_8cols(const T *, int,
    2667             :                                                          const double *, int,
    2668             :                                                          double *)
    2669             : {
    2670             :     // Cannot be reached
    2671             :     CPLAssert(false);
    2672             : }
    2673             : 
    2674             : #endif  // __AVX__
    2675             : 
    2676             : /************************************************************************/
    2677             : /*              GDALResampleConvolutionHorizontalSSE2<T>                */
    2678             : /************************************************************************/
    2679             : 
    2680             : template <class T>
    2681     2736394 : static inline double GDALResampleConvolutionHorizontalSSE2(
    2682             :     const T *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    2683             : {
    2684     2736394 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2685     2735919 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    2686     2736316 :     int i = 0;  // Used after for.
    2687     2811917 :     for (; i + 7 < nSrcPixelCount; i += 8)
    2688             :     {
    2689             :         // Retrieve the pixel & accumulate
    2690       75571 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunk + i);
    2691       75571 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunk + i + 4);
    2692       75571 :         const XMMReg4Double v_weight1 =
    2693       75571 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    2694       75571 :         const XMMReg4Double v_weight2 =
    2695       75571 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    2696             : 
    2697       75571 :         v_acc1 += v_pixels1 * v_weight1;
    2698       75571 :         v_acc2 += v_pixels2 * v_weight2;
    2699             :     }
    2700             : 
    2701     2736345 :     v_acc1 += v_acc2;
    2702             : 
    2703     2736020 :     double dfVal = v_acc1.GetHorizSum();
    2704     9501510 :     for (; i < nSrcPixelCount; ++i)
    2705             :     {
    2706     6765520 :         dfVal += pChunk[i] * padfWeightsAligned[i];
    2707             :     }
    2708     2735983 :     return dfVal;
    2709             : }
    2710             : 
    2711             : /************************************************************************/
    2712             : /*              GDALResampleConvolutionHorizontal<GByte>                */
    2713             : /************************************************************************/
    2714             : 
    2715             : template <>
    2716     2188440 : inline double GDALResampleConvolutionHorizontal<GByte>(
    2717             :     const GByte *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    2718             : {
    2719     2188440 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
    2720     2188480 :                                                  nSrcPixelCount);
    2721             : }
    2722             : 
    2723             : template <>
    2724      548283 : inline double GDALResampleConvolutionHorizontal<GUInt16>(
    2725             :     const GUInt16 *pChunk, const double *padfWeightsAligned, int nSrcPixelCount)
    2726             : {
    2727      548283 :     return GDALResampleConvolutionHorizontalSSE2(pChunk, padfWeightsAligned,
    2728      548492 :                                                  nSrcPixelCount);
    2729             : }
    2730             : 
    2731             : /************************************************************************/
    2732             : /*              GDALResampleConvolutionHorizontalWithMaskSSE2<T>        */
    2733             : /************************************************************************/
    2734             : 
    2735             : template <class T>
    2736     4582263 : static inline void GDALResampleConvolutionHorizontalWithMaskSSE2(
    2737             :     const T *pChunk, const GByte *pabyMask, const double *padfWeightsAligned,
    2738             :     int nSrcPixelCount, double &dfVal, double &dfWeightSum)
    2739             : {
    2740     4582263 :     int i = 0;  // Used after for.
    2741     4582263 :     XMMReg4Double v_acc = XMMReg4Double::Zero();
    2742     4582263 :     XMMReg4Double v_acc_weight = XMMReg4Double::Zero();
    2743    11403121 :     for (; i + 3 < nSrcPixelCount; i += 4)
    2744             :     {
    2745     6820848 :         const XMMReg4Double v_pixels = XMMReg4Double::Load4Val(pChunk + i);
    2746     6820848 :         const XMMReg4Double v_mask = XMMReg4Double::Load4Val(pabyMask + i);
    2747     6820848 :         XMMReg4Double v_weight =
    2748     6820848 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    2749     6820848 :         v_weight *= v_mask;
    2750     6820848 :         v_acc += v_pixels * v_weight;
    2751     6820848 :         v_acc_weight += v_weight;
    2752             :     }
    2753             : 
    2754     4582263 :     dfVal = v_acc.GetHorizSum();
    2755     4582263 :     dfWeightSum = v_acc_weight.GetHorizSum();
    2756     4780123 :     for (; i < nSrcPixelCount; ++i)
    2757             :     {
    2758      197860 :         const double dfWeight = padfWeightsAligned[i] * pabyMask[i];
    2759      197860 :         dfVal += pChunk[i] * dfWeight;
    2760      197860 :         dfWeightSum += dfWeight;
    2761             :     }
    2762     4582263 : }
    2763             : 
    2764             : /************************************************************************/
    2765             : /*              GDALResampleConvolutionHorizontalWithMask<GByte>        */
    2766             : /************************************************************************/
    2767             : 
    2768             : template <>
    2769     4582200 : inline void GDALResampleConvolutionHorizontalWithMask<GByte>(
    2770             :     const GByte *pChunk, const GByte *pabyMask,
    2771             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
    2772             :     double &dfWeightSum)
    2773             : {
    2774     4582200 :     GDALResampleConvolutionHorizontalWithMaskSSE2(
    2775             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
    2776             :         dfWeightSum);
    2777     4582200 : }
    2778             : 
    2779             : template <>
    2780          63 : inline void GDALResampleConvolutionHorizontalWithMask<GUInt16>(
    2781             :     const GUInt16 *pChunk, const GByte *pabyMask,
    2782             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfVal,
    2783             :     double &dfWeightSum)
    2784             : {
    2785          63 :     GDALResampleConvolutionHorizontalWithMaskSSE2(
    2786             :         pChunk, pabyMask, padfWeightsAligned, nSrcPixelCount, dfVal,
    2787             :         dfWeightSum);
    2788          63 : }
    2789             : 
    2790             : /************************************************************************/
    2791             : /*              GDALResampleConvolutionHorizontal_3rows_SSE2<T>         */
    2792             : /************************************************************************/
    2793             : 
    2794             : template <class T>
    2795    10023630 : static inline void GDALResampleConvolutionHorizontal_3rows_SSE2(
    2796             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2797             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    2798             :     double &dfRes2, double &dfRes3)
    2799             : {
    2800    10023630 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero(),
    2801    10023630 :                   v_acc2 = XMMReg4Double::Zero(),
    2802    10023630 :                   v_acc3 = XMMReg4Double::Zero();
    2803    10023630 :     int i = 0;
    2804    19989466 :     for (; i + 7 < nSrcPixelCount; i += 8)
    2805             :     {
    2806             :         // Retrieve the pixel & accumulate.
    2807     9965826 :         XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    2808     9965826 :         XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow1 + i + 4);
    2809     9965826 :         const XMMReg4Double v_weight1 =
    2810     9965826 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    2811     9965826 :         const XMMReg4Double v_weight2 =
    2812     9965826 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i + 4);
    2813             : 
    2814     9965826 :         v_acc1 += v_pixels1 * v_weight1;
    2815     9965826 :         v_acc1 += v_pixels2 * v_weight2;
    2816             : 
    2817     9965826 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow2 + i);
    2818     9965826 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i + 4);
    2819     9965826 :         v_acc2 += v_pixels1 * v_weight1;
    2820     9965826 :         v_acc2 += v_pixels2 * v_weight2;
    2821             : 
    2822     9965826 :         v_pixels1 = XMMReg4Double::Load4Val(pChunkRow3 + i);
    2823     9965826 :         v_pixels2 = XMMReg4Double::Load4Val(pChunkRow3 + i + 4);
    2824     9965826 :         v_acc3 += v_pixels1 * v_weight1;
    2825     9965826 :         v_acc3 += v_pixels2 * v_weight2;
    2826             :     }
    2827             : 
    2828    10023630 :     dfRes1 = v_acc1.GetHorizSum();
    2829    10023630 :     dfRes2 = v_acc2.GetHorizSum();
    2830    10023630 :     dfRes3 = v_acc3.GetHorizSum();
    2831    21487226 :     for (; i < nSrcPixelCount; ++i)
    2832             :     {
    2833    11463596 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    2834    11463596 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    2835    11463596 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    2836             :     }
    2837    10023630 : }
    2838             : 
    2839             : /************************************************************************/
    2840             : /*              GDALResampleConvolutionHorizontal_3rows<GByte>          */
    2841             : /************************************************************************/
    2842             : 
    2843             : template <>
    2844    10023600 : inline void GDALResampleConvolutionHorizontal_3rows<GByte>(
    2845             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    2846             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    2847             :     double &dfRes2, double &dfRes3)
    2848             : {
    2849    10023600 :     GDALResampleConvolutionHorizontal_3rows_SSE2(
    2850             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    2851             :         dfRes1, dfRes2, dfRes3);
    2852    10023600 : }
    2853             : 
    2854             : template <>
    2855          30 : inline void GDALResampleConvolutionHorizontal_3rows<GUInt16>(
    2856             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    2857             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    2858             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
    2859             : {
    2860          30 :     GDALResampleConvolutionHorizontal_3rows_SSE2(
    2861             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    2862             :         dfRes1, dfRes2, dfRes3);
    2863          30 : }
    2864             : 
    2865             : /************************************************************************/
    2866             : /*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2<T>   */
    2867             : /************************************************************************/
    2868             : 
    2869             : template <class T>
    2870     2173103 : static inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    2871             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2872             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    2873             :     double &dfRes2, double &dfRes3)
    2874             : {
    2875     2173103 :     XMMReg4Double v_acc1 = XMMReg4Double::Zero();
    2876     2172788 :     XMMReg4Double v_acc2 = XMMReg4Double::Zero();
    2877     2172943 :     XMMReg4Double v_acc3 = XMMReg4Double::Zero();
    2878     2172989 :     int i = 0;  // Use after for.
    2879     2176255 :     for (; i + 3 < nSrcPixelCount; i += 4)
    2880             :     {
    2881             :         // Retrieve the pixel & accumulate.
    2882        3236 :         const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1 + i);
    2883        3236 :         const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2 + i);
    2884        3236 :         const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3 + i);
    2885        3236 :         const XMMReg4Double v_weight =
    2886        3236 :             XMMReg4Double::Load4ValAligned(padfWeightsAligned + i);
    2887             : 
    2888        3236 :         v_acc1 += v_pixels1 * v_weight;
    2889        3236 :         v_acc2 += v_pixels2 * v_weight;
    2890        3236 :         v_acc3 += v_pixels3 * v_weight;
    2891             :     }
    2892             : 
    2893     2173025 :     dfRes1 = v_acc1.GetHorizSum();
    2894     2172783 :     dfRes2 = v_acc2.GetHorizSum();
    2895     2172831 :     dfRes3 = v_acc3.GetHorizSum();
    2896             : 
    2897     6493540 :     for (; i < nSrcPixelCount; ++i)
    2898             :     {
    2899     4320664 :         dfRes1 += pChunkRow1[i] * padfWeightsAligned[i];
    2900     4320664 :         dfRes2 += pChunkRow2[i] * padfWeightsAligned[i];
    2901     4320664 :         dfRes3 += pChunkRow3[i] * padfWeightsAligned[i];
    2902             :     }
    2903     2172886 : }
    2904             : 
    2905             : /************************************************************************/
    2906             : /*     GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>    */
    2907             : /************************************************************************/
    2908             : 
    2909             : template <>
    2910     2106280 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GByte>(
    2911             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    2912             :     const double *padfWeightsAligned, int nSrcPixelCount, double &dfRes1,
    2913             :     double &dfRes2, double &dfRes3)
    2914             : {
    2915     2106280 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    2916             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    2917             :         dfRes1, dfRes2, dfRes3);
    2918     2106310 : }
    2919             : 
    2920             : template <>
    2921       66660 : inline void GDALResampleConvolutionHorizontalPixelCountLess8_3rows<GUInt16>(
    2922             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    2923             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned,
    2924             :     int nSrcPixelCount, double &dfRes1, double &dfRes2, double &dfRes3)
    2925             : {
    2926       66660 :     GDALResampleConvolutionHorizontalPixelCountLess8_3rows_SSE2(
    2927             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, nSrcPixelCount,
    2928             :         dfRes1, dfRes2, dfRes3);
    2929       66849 : }
    2930             : 
    2931             : /************************************************************************/
    2932             : /*     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2<T>       */
    2933             : /************************************************************************/
    2934             : 
    2935             : template <class T>
    2936    12211540 : static inline void GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    2937             :     const T *pChunkRow1, const T *pChunkRow2, const T *pChunkRow3,
    2938             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
    2939             :     double &dfRes3)
    2940             : {
    2941    12211540 :     const XMMReg4Double v_weight =
    2942             :         XMMReg4Double::Load4ValAligned(padfWeightsAligned);
    2943             : 
    2944             :     // Retrieve the pixel & accumulate.
    2945    12129440 :     const XMMReg4Double v_pixels1 = XMMReg4Double::Load4Val(pChunkRow1);
    2946    12138510 :     const XMMReg4Double v_pixels2 = XMMReg4Double::Load4Val(pChunkRow2);
    2947    12190660 :     const XMMReg4Double v_pixels3 = XMMReg4Double::Load4Val(pChunkRow3);
    2948             : 
    2949    12223880 :     XMMReg4Double v_acc1 = v_pixels1 * v_weight;
    2950    12098420 :     XMMReg4Double v_acc2 = v_pixels2 * v_weight;
    2951    12139450 :     XMMReg4Double v_acc3 = v_pixels3 * v_weight;
    2952             : 
    2953    12147690 :     dfRes1 = v_acc1.GetHorizSum();
    2954    12158770 :     dfRes2 = v_acc2.GetHorizSum();
    2955    12184480 :     dfRes3 = v_acc3.GetHorizSum();
    2956    12177780 : }
    2957             : 
    2958             : /************************************************************************/
    2959             : /*       GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>      */
    2960             : /************************************************************************/
    2961             : 
    2962             : template <>
    2963     6620250 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GByte>(
    2964             :     const GByte *pChunkRow1, const GByte *pChunkRow2, const GByte *pChunkRow3,
    2965             :     const double *padfWeightsAligned, double &dfRes1, double &dfRes2,
    2966             :     double &dfRes3)
    2967             : {
    2968     6620250 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    2969             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    2970             :         dfRes3);
    2971     6614630 : }
    2972             : 
    2973             : template <>
    2974     5585650 : inline void GDALResampleConvolutionHorizontalPixelCount4_3rows<GUInt16>(
    2975             :     const GUInt16 *pChunkRow1, const GUInt16 *pChunkRow2,
    2976             :     const GUInt16 *pChunkRow3, const double *padfWeightsAligned, double &dfRes1,
    2977             :     double &dfRes2, double &dfRes3)
    2978             : {
    2979     5585650 :     GDALResampleConvolutionHorizontalPixelCount4_3rows_SSE2(
    2980             :         pChunkRow1, pChunkRow2, pChunkRow3, padfWeightsAligned, dfRes1, dfRes2,
    2981             :         dfRes3);
    2982     5574190 : }
    2983             : 
    2984             : #endif  // USE_SSE2
    2985             : 
    2986             : /************************************************************************/
    2987             : /*                    GDALResampleChunk_Convolution()                   */
    2988             : /************************************************************************/
    2989             : 
    2990             : template <class T, class Twork, GDALDataType eWrkDataType>
    2991        3593 : static CPLErr GDALResampleChunk_ConvolutionT(
    2992             :     double dfXRatioDstToSrc, double dfYRatioDstToSrc, double dfSrcXDelta,
    2993             :     double dfSrcYDelta, const T *pChunk, int nBands,
    2994             :     const GByte *pabyChunkNodataMask, int nChunkXOff, int nChunkXSize,
    2995             :     int nChunkYOff, int nChunkYSize, int nDstXOff, int nDstXOff2, int nDstYOff,
    2996             :     int nDstYOff2, GDALRasterBand *poDstBand, void *pDstBuffer, bool bHasNoData,
    2997             :     double dfNoDataValue, FilterFuncType pfnFilterFunc,
    2998             :     FilterFunc4ValuesType pfnFilterFunc4Values, int nKernelRadius,
    2999             :     bool bKernelWithNegativeWeights, float fMaxVal)
    3000             : 
    3001             : {
    3002        3593 :     if (!bHasNoData)
    3003        3559 :         dfNoDataValue = 0.0;
    3004        3593 :     const auto dstDataType = poDstBand->GetRasterDataType();
    3005        3591 :     const int nDstDataTypeSize = GDALGetDataTypeSizeBytes(dstDataType);
    3006        3596 :     const double dfReplacementVal =
    3007          39 :         bHasNoData ? GDALGetNoDataReplacementValue(dstDataType, dfNoDataValue)
    3008             :                    : dfNoDataValue;
    3009             :     // cppcheck-suppress unreadVariable
    3010        3596 :     const int isIntegerDT = GDALDataTypeIsInteger(dstDataType);
    3011        3595 :     const auto nNodataValueInt64 = static_cast<GInt64>(dfNoDataValue);
    3012             : 
    3013             :     // TODO: we should have some generic function to do this.
    3014        3595 :     Twork fDstMin = -std::numeric_limits<Twork>::max();
    3015        3595 :     Twork fDstMax = std::numeric_limits<Twork>::max();
    3016        3595 :     if (dstDataType == GDT_Byte)
    3017             :     {
    3018        2886 :         fDstMin = std::numeric_limits<GByte>::min();
    3019        2884 :         fDstMax = std::numeric_limits<GByte>::max();
    3020             :     }
    3021         709 :     else if (dstDataType == GDT_Int8)
    3022             :     {
    3023           0 :         fDstMin = std::numeric_limits<GInt8>::min();
    3024           0 :         fDstMax = std::numeric_limits<GInt8>::max();
    3025             :     }
    3026         709 :     else if (dstDataType == GDT_UInt16)
    3027             :     {
    3028         386 :         fDstMin = std::numeric_limits<GUInt16>::min();
    3029         385 :         fDstMax = std::numeric_limits<GUInt16>::max();
    3030             :     }
    3031         323 :     else if (dstDataType == GDT_Int16)
    3032             :     {
    3033         278 :         fDstMin = std::numeric_limits<GInt16>::min();
    3034         278 :         fDstMax = std::numeric_limits<GInt16>::max();
    3035             :     }
    3036          45 :     else if (dstDataType == GDT_UInt32)
    3037             :     {
    3038           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<GUInt32>::min());
    3039           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<GUInt32>::max());
    3040             :     }
    3041          44 :     else if (dstDataType == GDT_Int32)
    3042             :     {
    3043             :         // cppcheck-suppress unreadVariable
    3044           1 :         fDstMin = static_cast<Twork>(std::numeric_limits<GInt32>::min());
    3045             :         // cppcheck-suppress unreadVariable
    3046           1 :         fDstMax = static_cast<Twork>(std::numeric_limits<GInt32>::max());
    3047             :     }
    3048             : 
    3049    26993710 :     auto replaceValIfNodata = [bHasNoData, isIntegerDT, fDstMin, fDstMax,
    3050             :                                nNodataValueInt64, dfNoDataValue,
    3051             :                                dfReplacementVal](Twork fVal)
    3052             :     {
    3053    14083600 :         if (!bHasNoData)
    3054    10857300 :             return fVal;
    3055             : 
    3056             :         // Clamp value before comparing to nodata: this is only needed for
    3057             :         // kernels with negative weights (Lanczos)
    3058     3226360 :         Twork fClamped = fVal;
    3059     3226360 :         if (fClamped < fDstMin)
    3060       12874 :             fClamped = fDstMin;
    3061     3213490 :         else if (fClamped > fDstMax)
    3062       12852 :             fClamped = fDstMax;
    3063     3226360 :         if (isIntegerDT)
    3064             :         {
    3065     3226360 :             if (nNodataValueInt64 == static_cast<GInt64>(std::round(fClamped)))
    3066             :             {
    3067             :                 // Do not use the nodata value
    3068       13869 :                 return static_cast<Twork>(dfReplacementVal);
    3069             :             }
    3070             :         }
    3071           4 :         else if (dfNoDataValue == fClamped)
    3072             :         {
    3073             :             // Do not use the nodata value
    3074           1 :             return static_cast<Twork>(dfReplacementVal);
    3075             :         }
    3076     3212490 :         return fClamped;
    3077             :     };
    3078             : 
    3079             :     /* -------------------------------------------------------------------- */
    3080             :     /*      Allocate work buffers.                                          */
    3081             :     /* -------------------------------------------------------------------- */
    3082        3595 :     const int nDstXSize = nDstXOff2 - nDstXOff;
    3083        3595 :     Twork *pafWrkScanline = nullptr;
    3084        3595 :     if (dstDataType != eWrkDataType)
    3085             :     {
    3086             :         pafWrkScanline =
    3087        3558 :             static_cast<Twork *>(VSI_MALLOC2_VERBOSE(nDstXSize, sizeof(Twork)));
    3088        3563 :         if (pafWrkScanline == nullptr)
    3089           0 :             return CE_Failure;
    3090             :     }
    3091             : 
    3092        3600 :     const double dfXScale = 1.0 / dfXRatioDstToSrc;
    3093        3600 :     const double dfXScaleWeight = (dfXScale >= 1.0) ? 1.0 : dfXScale;
    3094        3600 :     const double dfXScaledRadius = nKernelRadius / dfXScaleWeight;
    3095        3600 :     const double dfYScale = 1.0 / dfYRatioDstToSrc;
    3096        3600 :     const double dfYScaleWeight = (dfYScale >= 1.0) ? 1.0 : dfYScale;
    3097        3600 :     const double dfYScaledRadius = nKernelRadius / dfYScaleWeight;
    3098             : 
    3099             :     // Temporary array to store result of horizontal filter.
    3100             :     double *padfHorizontalFiltered = static_cast<double *>(
    3101        3600 :         VSI_MALLOC3_VERBOSE(nChunkYSize, nDstXSize, sizeof(double) * nBands));
    3102             : 
    3103             :     // To store convolution coefficients.
    3104        3601 :     double *padfWeights = static_cast<double *>(VSI_MALLOC_ALIGNED_AUTO_VERBOSE(
    3105             :         static_cast<int>(2 + 2 * std::max(dfXScaledRadius, dfYScaledRadius) +
    3106             :                          0.5) *
    3107             :         sizeof(double)));
    3108             : 
    3109        3594 :     GByte *pabyChunkNodataMaskHorizontalFiltered = nullptr;
    3110        3594 :     if (pabyChunkNodataMask)
    3111             :         pabyChunkNodataMaskHorizontalFiltered =
    3112         342 :             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nChunkYSize, nDstXSize));
    3113        3594 :     if (padfHorizontalFiltered == nullptr || padfWeights == nullptr ||
    3114         342 :         (pabyChunkNodataMask != nullptr &&
    3115             :          pabyChunkNodataMaskHorizontalFiltered == nullptr))
    3116             :     {
    3117           3 :         VSIFree(pafWrkScanline);
    3118           0 :         VSIFree(padfHorizontalFiltered);
    3119           0 :         VSIFreeAligned(padfWeights);
    3120           0 :         VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    3121           0 :         return CE_Failure;
    3122             :     }
    3123             : 
    3124             :     /* ==================================================================== */
    3125             :     /*      First pass: horizontal filter                                   */
    3126             :     /* ==================================================================== */
    3127        3591 :     const int nChunkRightXOff = nChunkXOff + nChunkXSize;
    3128             : #ifdef USE_SSE2
    3129        3591 :     bool bSrcPixelCountLess8 = dfXScaledRadius < 4;
    3130             : #endif
    3131     2711634 :     for (int iDstPixel = nDstXOff; iDstPixel < nDstXOff2; ++iDstPixel)
    3132             :     {
    3133     2708036 :         const double dfSrcPixel =
    3134     2708036 :             (iDstPixel + 0.5) * dfXRatioDstToSrc + dfSrcXDelta;
    3135     2708036 :         int nSrcPixelStart =
    3136     2708036 :             static_cast<int>(floor(dfSrcPixel - dfXScaledRadius + 0.5));
    3137     2708036 :         if (nSrcPixelStart < nChunkXOff)
    3138       54479 :             nSrcPixelStart = nChunkXOff;
    3139     2708036 :         int nSrcPixelStop =
    3140     2708036 :             static_cast<int>(dfSrcPixel + dfXScaledRadius + 0.5);
    3141     2708036 :         if (nSrcPixelStop > nChunkRightXOff)
    3142       54492 :             nSrcPixelStop = nChunkRightXOff;
    3143             : #if 0
    3144             :         if( nSrcPixelStart < nChunkXOff && nChunkXOff > 0 )
    3145             :         {
    3146             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3147             :         }
    3148             :         if( nSrcPixelStop > nChunkRightXOff && nChunkRightXOff < nSrcWidth )
    3149             :         {
    3150             :             printf( "truncated iDstPixel = %d\n", iDstPixel );/*ok*/
    3151             :         }
    3152             : #endif
    3153     2708036 :         const int nSrcPixelCount = nSrcPixelStop - nSrcPixelStart;
    3154     2708036 :         double dfWeightSum = 0.0;
    3155             : 
    3156             :         // Compute convolution coefficients.
    3157     2708036 :         int nSrcPixel = nSrcPixelStart;
    3158     2708036 :         double dfX = dfXScaleWeight * (nSrcPixel - dfSrcPixel + 0.5);
    3159     3519436 :         for (; nSrcPixel + 3 < nSrcPixelStop; nSrcPixel += 4)
    3160             :         {
    3161      811506 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfX;
    3162      811506 :             dfX += dfXScaleWeight;
    3163      811506 :             padfWeights[nSrcPixel + 1 - nSrcPixelStart] = dfX;
    3164      811506 :             dfX += dfXScaleWeight;
    3165      811506 :             padfWeights[nSrcPixel + 2 - nSrcPixelStart] = dfX;
    3166      811506 :             dfX += dfXScaleWeight;
    3167      811506 :             padfWeights[nSrcPixel + 3 - nSrcPixelStart] = dfX;
    3168      811506 :             dfX += dfXScaleWeight;
    3169      811408 :             dfWeightSum +=
    3170      811506 :                 pfnFilterFunc4Values(padfWeights + nSrcPixel - nSrcPixelStart);
    3171             :         }
    3172     6687365 :         for (; nSrcPixel < nSrcPixelStop; ++nSrcPixel, dfX += dfXScaleWeight)
    3173             :         {
    3174     3979751 :             const double dfWeight = pfnFilterFunc(dfX);
    3175     3979434 :             padfWeights[nSrcPixel - nSrcPixelStart] = dfWeight;
    3176     3979434 :             dfWeightSum += dfWeight;
    3177             :         }
    3178             : 
    3179     2707614 :         const int nHeight = nChunkYSize * nBands;
    3180     2707614 :         if (pabyChunkNodataMask == nullptr)
    3181             :         {
    3182     2647397 :             if (dfWeightSum != 0)
    3183             :             {
    3184     2647304 :                 const double dfInvWeightSum = 1.0 / dfWeightSum;
    3185     9449510 :                 for (int i = 0; i < nSrcPixelCount; ++i)
    3186     6802203 :                     padfWeights[i] *= dfInvWeightSum;
    3187             :             }
    3188     2647397 :             int iSrcLineOff = 0;
    3189             : #ifdef USE_SSE2
    3190     2647397 :             if (nSrcPixelCount == 4)
    3191             :             {
    3192    13963046 :                 for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
    3193             :                 {
    3194    13399166 :                     const GPtrDiff_t j =
    3195    13399166 :                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3196    13399166 :                         (nSrcPixelStart - nChunkXOff);
    3197    13399166 :                     double dfVal1 = 0.0;
    3198    13399166 :                     double dfVal2 = 0.0;
    3199    13399166 :                     double dfVal3 = 0.0;
    3200    13399166 :                     GDALResampleConvolutionHorizontalPixelCount4_3rows(
    3201    13399166 :                         pChunk + j, pChunk + j + nChunkXSize,
    3202    13399166 :                         pChunk + j + 2 * nChunkXSize, padfWeights, dfVal1,
    3203             :                         dfVal2, dfVal3);
    3204    13427716 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3205    13427716 :                                                nDstXSize +
    3206    13427716 :                                            iDstPixel - nDstXOff] = dfVal1;
    3207    13427716 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3208    13427716 :                                             1) *
    3209    13427716 :                                                nDstXSize +
    3210    13427716 :                                            iDstPixel - nDstXOff] = dfVal2;
    3211    13427716 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3212    13427716 :                                             2) *
    3213    13427716 :                                                nDstXSize +
    3214    13427716 :                                            iDstPixel - nDstXOff] = dfVal3;
    3215             :                 }
    3216             :             }
    3217     2112058 :             else if (bSrcPixelCountLess8)
    3218             :             {
    3219     4225011 :                 for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
    3220             :                 {
    3221     2190968 :                     const GPtrDiff_t j =
    3222     2190968 :                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3223     2190968 :                         (nSrcPixelStart - nChunkXOff);
    3224     2190968 :                     double dfVal1 = 0.0;
    3225     2190968 :                     double dfVal2 = 0.0;
    3226     2190968 :                     double dfVal3 = 0.0;
    3227     2190968 :                     GDALResampleConvolutionHorizontalPixelCountLess8_3rows(
    3228     2190968 :                         pChunk + j, pChunk + j + nChunkXSize,
    3229     2190968 :                         pChunk + j + 2 * nChunkXSize, padfWeights,
    3230             :                         nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3231     2191327 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3232     2191327 :                                                nDstXSize +
    3233     2191327 :                                            iDstPixel - nDstXOff] = dfVal1;
    3234     2191327 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3235     2191327 :                                             1) *
    3236     2191327 :                                                nDstXSize +
    3237     2191327 :                                            iDstPixel - nDstXOff] = dfVal2;
    3238     2191327 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3239     2191327 :                                             2) *
    3240     2191327 :                                                nDstXSize +
    3241     2191327 :                                            iDstPixel - nDstXOff] = dfVal3;
    3242             :                 }
    3243             :             }
    3244             :             else
    3245             : #endif
    3246             :             {
    3247    10166834 :                 for (; iSrcLineOff + 2 < nHeight; iSrcLineOff += 3)
    3248             :                 {
    3249    10088430 :                     const GPtrDiff_t j =
    3250    10088430 :                         static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3251    10088430 :                         (nSrcPixelStart - nChunkXOff);
    3252    10088430 :                     double dfVal1 = 0.0;
    3253    10088430 :                     double dfVal2 = 0.0;
    3254    10088430 :                     double dfVal3 = 0.0;
    3255    10088430 :                     GDALResampleConvolutionHorizontal_3rows(
    3256    10088430 :                         pChunk + j, pChunk + j + nChunkXSize,
    3257    10088430 :                         pChunk + j + 2 * nChunkXSize, padfWeights,
    3258             :                         nSrcPixelCount, dfVal1, dfVal2, dfVal3);
    3259    10088430 :                     padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3260    10088430 :                                                nDstXSize +
    3261    10088430 :                                            iDstPixel - nDstXOff] = dfVal1;
    3262    10088430 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3263    10088430 :                                             1) *
    3264    10088430 :                                                nDstXSize +
    3265    10088430 :                                            iDstPixel - nDstXOff] = dfVal2;
    3266    10088430 :                     padfHorizontalFiltered[(static_cast<size_t>(iSrcLineOff) +
    3267    10088430 :                                             2) *
    3268    10088430 :                                                nDstXSize +
    3269    10088430 :                                            iDstPixel - nDstXOff] = dfVal3;
    3270             :                 }
    3271             :             }
    3272     5457904 :             for (; iSrcLineOff < nHeight; ++iSrcLineOff)
    3273             :             {
    3274     2781344 :                 const GPtrDiff_t j =
    3275     2781344 :                     static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3276     2781344 :                     (nSrcPixelStart - nChunkXOff);
    3277     5518302 :                 const double dfVal = GDALResampleConvolutionHorizontal(
    3278     2781344 :                     pChunk + j, padfWeights, nSrcPixelCount);
    3279     2781596 :                 padfHorizontalFiltered[static_cast<size_t>(iSrcLineOff) *
    3280     2781596 :                                            nDstXSize +
    3281     2781596 :                                        iDstPixel - nDstXOff] = dfVal;
    3282             :             }
    3283             :         }
    3284             :         else
    3285             :         {
    3286    15629548 :             for (int iSrcLineOff = 0; iSrcLineOff < nHeight; ++iSrcLineOff)
    3287             :             {
    3288    15571750 :                 const GPtrDiff_t j =
    3289    15571750 :                     static_cast<GPtrDiff_t>(iSrcLineOff) * nChunkXSize +
    3290    15571750 :                     (nSrcPixelStart - nChunkXOff);
    3291             : 
    3292    15571750 :                 if (bKernelWithNegativeWeights)
    3293             :                 {
    3294    15091144 :                     int nConsecutiveValid = 0;
    3295    15091144 :                     int nMaxConsecutiveValid = 0;
    3296   130248786 :                     for (int k = 0; k < nSrcPixelCount; k++)
    3297             :                     {
    3298   115157642 :                         if (pabyChunkNodataMask[j + k])
    3299    25502104 :                             nConsecutiveValid++;
    3300    89655138 :                         else if (nConsecutiveValid)
    3301             :                         {
    3302       39842 :                             nMaxConsecutiveValid = std::max(
    3303       39842 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    3304       39842 :                             nConsecutiveValid = 0;
    3305             :                         }
    3306             :                     }
    3307    15091144 :                     nMaxConsecutiveValid =
    3308    15091144 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    3309    15091144 :                     if (nMaxConsecutiveValid < nSrcPixelCount / 2)
    3310             :                     {
    3311    10989474 :                         const size_t nTempOffset =
    3312    10989474 :                             static_cast<size_t>(iSrcLineOff) * nDstXSize +
    3313    10989474 :                             iDstPixel - nDstXOff;
    3314    10989474 :                         padfHorizontalFiltered[nTempOffset] = 0.0;
    3315    10989474 :                         pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3316    10989474 :                         continue;
    3317             :                     }
    3318             :                 }
    3319             : 
    3320     4582276 :                 double dfVal = 0.0;
    3321     4582276 :                 GDALResampleConvolutionHorizontalWithMask(
    3322     4582276 :                     pChunk + j, pabyChunkNodataMask + j, padfWeights,
    3323             :                     nSrcPixelCount, dfVal, dfWeightSum);
    3324     4579813 :                 const size_t nTempOffset =
    3325     4579813 :                     static_cast<size_t>(iSrcLineOff) * nDstXSize + iDstPixel -
    3326     4579813 :                     nDstXOff;
    3327     4579813 :                 if (dfWeightSum > 0.0)
    3328             :                 {
    3329     4538924 :                     padfHorizontalFiltered[nTempOffset] = dfVal / dfWeightSum;
    3330     4538924 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 1;
    3331             :                 }
    3332             :                 else
    3333             :                 {
    3334       40950 :                     padfHorizontalFiltered[nTempOffset] = 0.0;
    3335       40950 :                     pabyChunkNodataMaskHorizontalFiltered[nTempOffset] = 0;
    3336             :                 }
    3337             :             }
    3338             :         }
    3339             :     }
    3340             : 
    3341             :     /* ==================================================================== */
    3342             :     /*      Second pass: vertical filter                                    */
    3343             :     /* ==================================================================== */
    3344        3606 :     const int nChunkBottomYOff = nChunkYOff + nChunkYSize;
    3345             : 
    3346      192485 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    3347             :     {
    3348      188879 :         Twork *const pafDstScanline =
    3349      188879 :             pafWrkScanline ? pafWrkScanline
    3350        8414 :                            : static_cast<Twork *>(pDstBuffer) +
    3351        8414 :                                  (iDstLine - nDstYOff) * nDstXSize;
    3352             : 
    3353      188879 :         const double dfSrcLine =
    3354      188879 :             (iDstLine + 0.5) * dfYRatioDstToSrc + dfSrcYDelta;
    3355      188879 :         int nSrcLineStart =
    3356      188879 :             static_cast<int>(floor(dfSrcLine - dfYScaledRadius + 0.5));
    3357      188879 :         int nSrcLineStop = static_cast<int>(dfSrcLine + dfYScaledRadius + 0.5);
    3358      188879 :         if (nSrcLineStart < nChunkYOff)
    3359        1719 :             nSrcLineStart = nChunkYOff;
    3360      188879 :         if (nSrcLineStop > nChunkBottomYOff)
    3361        1744 :             nSrcLineStop = nChunkBottomYOff;
    3362             : #if 0
    3363             :         if( nSrcLineStart < nChunkYOff &&
    3364             :             nChunkYOff > 0 )
    3365             :         {
    3366             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    3367             :         }
    3368             :         if( nSrcLineStop > nChunkBottomYOff && nChunkBottomYOff < nSrcHeight )
    3369             :         {
    3370             :             printf( "truncated iDstLine = %d\n", iDstLine );/*ok*/
    3371             :         }
    3372             : #endif
    3373      188879 :         const int nSrcLineCount = nSrcLineStop - nSrcLineStart;
    3374      188879 :         double dfWeightSum = 0.0;
    3375             : 
    3376             :         // Compute convolution coefficients.
    3377      188879 :         int nSrcLine = nSrcLineStart;  // Used after for.
    3378      188879 :         double dfY = dfYScaleWeight * (nSrcLine - dfSrcLine + 0.5);
    3379      414441 :         for (; nSrcLine + 3 < nSrcLineStop;
    3380      225562 :              nSrcLine += 4, dfY += 4 * dfYScaleWeight)
    3381             :         {
    3382      225566 :             padfWeights[nSrcLine - nSrcLineStart] = dfY;
    3383      225566 :             padfWeights[nSrcLine + 1 - nSrcLineStart] = dfY + dfYScaleWeight;
    3384      225566 :             padfWeights[nSrcLine + 2 - nSrcLineStart] =
    3385      225566 :                 dfY + 2 * dfYScaleWeight;
    3386      225566 :             padfWeights[nSrcLine + 3 - nSrcLineStart] =
    3387      225566 :                 dfY + 3 * dfYScaleWeight;
    3388      225562 :             dfWeightSum +=
    3389      225566 :                 pfnFilterFunc4Values(padfWeights + nSrcLine - nSrcLineStart);
    3390             :         }
    3391      220347 :         for (; nSrcLine < nSrcLineStop; ++nSrcLine, dfY += dfYScaleWeight)
    3392             :         {
    3393       31475 :             const double dfWeight = pfnFilterFunc(dfY);
    3394       31472 :             padfWeights[nSrcLine - nSrcLineStart] = dfWeight;
    3395       31472 :             dfWeightSum += dfWeight;
    3396             :         }
    3397             : 
    3398      188872 :         if (pabyChunkNodataMask == nullptr)
    3399             :         {
    3400      158693 :             if (dfWeightSum != 0)
    3401             :             {
    3402      158691 :                 const double dfInvWeightSum = 1.0 / dfWeightSum;
    3403      897624 :                 for (int i = 0; i < nSrcLineCount; ++i)
    3404      738933 :                     padfWeights[i] *= dfInvWeightSum;
    3405             :             }
    3406             :         }
    3407             : 
    3408      188872 :         if (pabyChunkNodataMask == nullptr)
    3409             :         {
    3410      158698 :             int iFilteredPixelOff = 0;  // Used after for.
    3411             :             // j used after for.
    3412      158698 :             size_t j =
    3413      158698 :                 (nSrcLineStart - nChunkYOff) * static_cast<size_t>(nDstXSize);
    3414             : #ifdef USE_SSE2
    3415             :             if constexpr (eWrkDataType == GDT_Float32)
    3416             :             {
    3417             : #ifdef __AVX__
    3418             :                 for (; iFilteredPixelOff + 15 < nDstXSize;
    3419             :                      iFilteredPixelOff += 16, j += 16)
    3420             :                 {
    3421             :                     GDALResampleConvolutionVertical_16cols(
    3422             :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3423             :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    3424             :                     if (bHasNoData)
    3425             :                     {
    3426             :                         for (int k = 0; k < 16; k++)
    3427             :                         {
    3428             :                             pafDstScanline[iFilteredPixelOff + k] =
    3429             :                                 replaceValIfNodata(
    3430             :                                     pafDstScanline[iFilteredPixelOff + k]);
    3431             :                         }
    3432             :                     }
    3433             :                 }
    3434             : #else
    3435    18719887 :                 for (; iFilteredPixelOff + 7 < nDstXSize;
    3436             :                      iFilteredPixelOff += 8, j += 8)
    3437             :                 {
    3438    18615900 :                     GDALResampleConvolutionVertical_8cols(
    3439    18615900 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3440    18615900 :                         nSrcLineCount, pafDstScanline + iFilteredPixelOff);
    3441    18568400 :                     if (bHasNoData)
    3442             :                     {
    3443       17820 :                         for (int k = 0; k < 8; k++)
    3444             :                         {
    3445       15840 :                             pafDstScanline[iFilteredPixelOff + k] =
    3446       15840 :                                 replaceValIfNodata(
    3447       15840 :                                     pafDstScanline[iFilteredPixelOff + k]);
    3448             :                         }
    3449             :                     }
    3450             :                 }
    3451             : #endif
    3452             : 
    3453      566658 :                 for (; iFilteredPixelOff < nDstXSize; iFilteredPixelOff++, j++)
    3454             :                 {
    3455      462717 :                     const Twork fVal =
    3456      462639 :                         static_cast<Twork>(GDALResampleConvolutionVertical(
    3457      462639 :                             padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3458             :                             nSrcLineCount));
    3459      462655 :                     pafDstScanline[iFilteredPixelOff] =
    3460      462717 :                         replaceValIfNodata(fVal);
    3461             :                 }
    3462             :             }
    3463             :             else
    3464             : #endif
    3465             :             {
    3466     2887210 :                 for (; iFilteredPixelOff + 1 < nDstXSize;
    3467             :                      iFilteredPixelOff += 2, j += 2)
    3468             :                 {
    3469     2880000 :                     double dfVal1 = 0.0;
    3470     2880000 :                     double dfVal2 = 0.0;
    3471     2880000 :                     GDALResampleConvolutionVertical_2cols(
    3472     2880000 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3473             :                         nSrcLineCount, dfVal1, dfVal2);
    3474     5760010 :                     pafDstScanline[iFilteredPixelOff] =
    3475     2880000 :                         replaceValIfNodata(static_cast<Twork>(dfVal1));
    3476     2880000 :                     pafDstScanline[iFilteredPixelOff + 1] =
    3477     2880000 :                         replaceValIfNodata(static_cast<Twork>(dfVal2));
    3478             :                 }
    3479        7204 :                 if (iFilteredPixelOff < nDstXSize)
    3480             :                 {
    3481           0 :                     const double dfVal = GDALResampleConvolutionVertical(
    3482           0 :                         padfHorizontalFiltered + j, nDstXSize, padfWeights,
    3483             :                         nSrcLineCount);
    3484           0 :                     pafDstScanline[iFilteredPixelOff] =
    3485           0 :                         replaceValIfNodata(static_cast<Twork>(dfVal));
    3486             :                 }
    3487             :             }
    3488             :         }
    3489             :         else
    3490             :         {
    3491    16007350 :             for (int iFilteredPixelOff = 0; iFilteredPixelOff < nDstXSize;
    3492             :                  ++iFilteredPixelOff)
    3493             :             {
    3494    15977137 :                 double dfVal = 0.0;
    3495    15977137 :                 dfWeightSum = 0.0;
    3496    15977137 :                 size_t j = (nSrcLineStart - nChunkYOff) *
    3497    15977137 :                                static_cast<size_t>(nDstXSize) +
    3498    15977137 :                            iFilteredPixelOff;
    3499    15977137 :                 if (bKernelWithNegativeWeights)
    3500             :                 {
    3501    15752333 :                     int nConsecutiveValid = 0;
    3502    15752333 :                     int nMaxConsecutiveValid = 0;
    3503   104744705 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    3504             :                     {
    3505    88992272 :                         const double dfWeight =
    3506    88992272 :                             padfWeights[i] *
    3507             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    3508    88992272 :                         if (pabyChunkNodataMaskHorizontalFiltered[j])
    3509             :                         {
    3510    34763295 :                             nConsecutiveValid++;
    3511             :                         }
    3512    54228977 :                         else if (nConsecutiveValid)
    3513             :                         {
    3514      172294 :                             nMaxConsecutiveValid = std::max(
    3515      172294 :                                 nMaxConsecutiveValid, nConsecutiveValid);
    3516      172294 :                             nConsecutiveValid = 0;
    3517             :                         }
    3518    88992272 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    3519    88992272 :                         dfWeightSum += dfWeight;
    3520             :                     }
    3521    15752333 :                     nMaxConsecutiveValid =
    3522    15752333 :                         std::max(nMaxConsecutiveValid, nConsecutiveValid);
    3523    15752333 :                     if (nMaxConsecutiveValid < nSrcLineCount / 2)
    3524             :                     {
    3525     8116035 :                         pafDstScanline[iFilteredPixelOff] =
    3526     8116033 :                             static_cast<Twork>(dfNoDataValue);
    3527     8116035 :                         continue;
    3528             :                     }
    3529             :                 }
    3530             :                 else
    3531             :                 {
    3532     1130262 :                     for (int i = 0; i < nSrcLineCount; ++i, j += nDstXSize)
    3533             :                     {
    3534      905432 :                         const double dfWeight =
    3535      905432 :                             padfWeights[i] *
    3536             :                             pabyChunkNodataMaskHorizontalFiltered[j];
    3537      905432 :                         dfVal += padfHorizontalFiltered[j] * dfWeight;
    3538      905432 :                         dfWeightSum += dfWeight;
    3539             :                     }
    3540             :                 }
    3541     7861102 :                 if (dfWeightSum > 0.0)
    3542             :                 {
    3543     7845089 :                     pafDstScanline[iFilteredPixelOff] = replaceValIfNodata(
    3544     7845087 :                         static_cast<Twork>(dfVal / dfWeightSum));
    3545             :                 }
    3546             :                 else
    3547             :                 {
    3548       16011 :                     pafDstScanline[iFilteredPixelOff] =
    3549       16007 :                         static_cast<Twork>(dfNoDataValue);
    3550             :                 }
    3551             :             }
    3552             :         }
    3553             : 
    3554      141397 :         if (fMaxVal != 0.0f)
    3555             :         {
    3556      192324 :             for (int i = 0; i < nDstXSize; ++i)
    3557             :             {
    3558      192088 :                 if (pafDstScanline[i] > fMaxVal)
    3559       96022 :                     pafDstScanline[i] = fMaxVal;
    3560             :             }
    3561             :         }
    3562             : 
    3563      141397 :         if (pafWrkScanline)
    3564             :         {
    3565      180461 :             GDALCopyWords(pafWrkScanline, eWrkDataType, 4,
    3566             :                           static_cast<GByte *>(pDstBuffer) +
    3567      180461 :                               static_cast<size_t>(iDstLine - nDstYOff) *
    3568      180461 :                                   nDstXSize * nDstDataTypeSize,
    3569             :                           dstDataType, nDstDataTypeSize, nDstXSize);
    3570             :         }
    3571             :     }
    3572             : 
    3573        3606 :     VSIFree(pafWrkScanline);
    3574        3606 :     VSIFreeAligned(padfWeights);
    3575        3606 :     VSIFree(padfHorizontalFiltered);
    3576        3606 :     VSIFree(pabyChunkNodataMaskHorizontalFiltered);
    3577             : 
    3578        3606 :     return CE_None;
    3579             : }
    3580             : 
    3581        3605 : static CPLErr GDALResampleChunk_Convolution(
    3582             :     double dfXRatioDstToSrc, double dfYRatioDstToSrc, double dfSrcXDelta,
    3583             :     double dfSrcYDelta, GDALDataType eWrkDataType, const void *pChunk,
    3584             :     const GByte *pabyChunkNodataMask, int nChunkXOff, int nChunkXSize,
    3585             :     int nChunkYOff, int nChunkYSize, int nDstXOff, int nDstXOff2, int nDstYOff,
    3586             :     int nDstYOff2, GDALRasterBand *poOverview, void **ppDstBuffer,
    3587             :     GDALDataType *peDstBufferDataType, const char *pszResampling,
    3588             :     bool bHasNoData, double dfNoDataValue,
    3589             :     GDALColorTable * /* poColorTable_unused */, GDALDataType /* eSrcDataType */,
    3590             :     bool /* bPropagateNoData */)
    3591             : {
    3592             :     GDALResampleAlg eResample;
    3593        3605 :     bool bKernelWithNegativeWeights = false;
    3594        3605 :     if (EQUAL(pszResampling, "BILINEAR"))
    3595        2569 :         eResample = GRA_Bilinear;
    3596        1036 :     else if (EQUAL(pszResampling, "CUBIC"))
    3597             :     {
    3598         981 :         eResample = GRA_Cubic;
    3599         981 :         bKernelWithNegativeWeights = true;
    3600             :     }
    3601          55 :     else if (EQUAL(pszResampling, "CUBICSPLINE"))
    3602          23 :         eResample = GRA_CubicSpline;
    3603          32 :     else if (EQUAL(pszResampling, "LANCZOS"))
    3604             :     {
    3605          26 :         eResample = GRA_Lanczos;
    3606          26 :         bKernelWithNegativeWeights = true;
    3607             :     }
    3608             :     else
    3609             :     {
    3610           6 :         CPLAssert(false);
    3611             :         return CE_Failure;
    3612             :     }
    3613        3599 :     const int nKernelRadius = GWKGetFilterRadius(eResample);
    3614        3599 :     FilterFuncType pfnFilterFunc = GWKGetFilterFunc(eResample);
    3615             :     const FilterFunc4ValuesType pfnFilterFunc4Values =
    3616        3602 :         GWKGetFilterFunc4Values(eResample);
    3617             : 
    3618        3599 :     float fMaxVal = 0.f;
    3619             :     // Cubic, etc... can have overshoots, so make sure we clamp values to the
    3620             :     // maximum value if NBITS is set.
    3621             :     const char *pszNBITS =
    3622        3599 :         poOverview->GetMetadataItem("NBITS", "IMAGE_STRUCTURE");
    3623        3600 :     GDALDataType eBandDT = poOverview->GetRasterDataType();
    3624        3602 :     if (eResample != GRA_Bilinear && pszNBITS != nullptr &&
    3625           2 :         (eBandDT == GDT_Byte || eBandDT == GDT_UInt16 || eBandDT == GDT_UInt32))
    3626             :     {
    3627           8 :         int nBits = atoi(pszNBITS);
    3628           8 :         if (nBits == GDALGetDataTypeSize(eBandDT))
    3629           1 :             nBits = 0;
    3630           8 :         if (nBits > 0 && nBits < 32)
    3631           7 :             fMaxVal = static_cast<float>((1U << nBits) - 1);
    3632             :     }
    3633             : 
    3634        3603 :     *ppDstBuffer =
    3635        3602 :         VSI_MALLOC3_VERBOSE(nDstXOff2 - nDstXOff, nDstYOff2 - nDstYOff,
    3636             :                             GDALGetDataTypeSizeBytes(eBandDT));
    3637        3603 :     if (*ppDstBuffer == nullptr)
    3638             :     {
    3639           0 :         return CE_Failure;
    3640             :     }
    3641        3603 :     *peDstBufferDataType = eBandDT;
    3642             : 
    3643        3603 :     if (eWrkDataType == GDT_Byte)
    3644        2888 :         return GDALResampleChunk_ConvolutionT<GByte, float, GDT_Float32>(
    3645             :             dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
    3646             :             static_cast<const GByte *>(pChunk), 1, pabyChunkNodataMask,
    3647             :             nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
    3648             :             nDstXOff2, nDstYOff, nDstYOff2, poOverview, *ppDstBuffer,
    3649             :             bHasNoData, dfNoDataValue, pfnFilterFunc, pfnFilterFunc4Values,
    3650        2888 :             nKernelRadius, bKernelWithNegativeWeights, fMaxVal);
    3651         715 :     else if (eWrkDataType == GDT_UInt16)
    3652         394 :         return GDALResampleChunk_ConvolutionT<GUInt16, float, GDT_Float32>(
    3653             :             dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
    3654             :             static_cast<const GUInt16 *>(pChunk), 1, pabyChunkNodataMask,
    3655             :             nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
    3656             :             nDstXOff2, nDstYOff, nDstYOff2, poOverview, *ppDstBuffer,
    3657             :             bHasNoData, dfNoDataValue, pfnFilterFunc, pfnFilterFunc4Values,
    3658         396 :             nKernelRadius, bKernelWithNegativeWeights, fMaxVal);
    3659         321 :     else if (eWrkDataType == GDT_Float32)
    3660         300 :         return GDALResampleChunk_ConvolutionT<float, float, GDT_Float32>(
    3661             :             dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
    3662             :             static_cast<const float *>(pChunk), 1, pabyChunkNodataMask,
    3663             :             nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
    3664             :             nDstXOff2, nDstYOff, nDstYOff2, poOverview, *ppDstBuffer,
    3665             :             bHasNoData, dfNoDataValue, pfnFilterFunc, pfnFilterFunc4Values,
    3666         300 :             nKernelRadius, bKernelWithNegativeWeights, fMaxVal);
    3667          21 :     else if (eWrkDataType == GDT_Float64)
    3668          22 :         return GDALResampleChunk_ConvolutionT<double, double, GDT_Float64>(
    3669             :             dfXRatioDstToSrc, dfYRatioDstToSrc, dfSrcXDelta, dfSrcYDelta,
    3670             :             static_cast<const double *>(pChunk), 1, pabyChunkNodataMask,
    3671             :             nChunkXOff, nChunkXSize, nChunkYOff, nChunkYSize, nDstXOff,
    3672             :             nDstXOff2, nDstYOff, nDstYOff2, poOverview, *ppDstBuffer,
    3673             :             bHasNoData, dfNoDataValue, pfnFilterFunc, pfnFilterFunc4Values,
    3674          22 :             nKernelRadius, bKernelWithNegativeWeights, fMaxVal);
    3675             : 
    3676           0 :     CPLAssert(false);
    3677             :     return CE_Failure;
    3678             : }
    3679             : 
    3680             : /************************************************************************/
    3681             : /*                       GDALResampleChunkC32R()                        */
    3682             : /************************************************************************/
    3683             : 
    3684          10 : static CPLErr GDALResampleChunkC32R(int nSrcWidth, int nSrcHeight,
    3685             :                                     const float *pafChunk, int nChunkYOff,
    3686             :                                     int nChunkYSize, int nDstYOff,
    3687             :                                     int nDstYOff2, GDALRasterBand *poOverview,
    3688             :                                     void **ppDstBuffer,
    3689             :                                     GDALDataType *peDstBufferDataType,
    3690             :                                     const char *pszResampling)
    3691             : 
    3692             : {
    3693             :     enum Method
    3694             :     {
    3695             :         NEAR,
    3696             :         AVERAGE,
    3697             :         AVERAGE_MAGPHASE,
    3698             :         RMS,
    3699             :     };
    3700             : 
    3701          10 :     Method eMethod = NEAR;
    3702          10 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))
    3703             :     {
    3704           8 :         eMethod = NEAR;
    3705             :     }
    3706           2 :     else if (EQUAL(pszResampling, "AVERAGE_MAGPHASE"))
    3707             :     {
    3708           0 :         eMethod = AVERAGE_MAGPHASE;
    3709             :     }
    3710           2 :     else if (EQUAL(pszResampling, "RMS"))
    3711             :     {
    3712           2 :         eMethod = RMS;
    3713             :     }
    3714           0 :     else if (STARTS_WITH_CI(pszResampling, "AVER"))
    3715             :     {
    3716           0 :         eMethod = AVERAGE;
    3717             :     }
    3718             :     else
    3719             :     {
    3720           0 :         CPLError(
    3721             :             CE_Failure, CPLE_NotSupported,
    3722             :             "Resampling method %s is not supported for complex data types. "
    3723             :             "Only NEAREST, AVERAGE, AVERAGE_MAGPHASE and RMS are supported",
    3724             :             pszResampling);
    3725           0 :         return CE_Failure;
    3726             :     }
    3727             : 
    3728          10 :     const int nOXSize = poOverview->GetXSize();
    3729          10 :     *ppDstBuffer = VSI_MALLOC3_VERBOSE(nOXSize, nDstYOff2 - nDstYOff,
    3730             :                                        GDALGetDataTypeSizeBytes(GDT_CFloat32));
    3731          10 :     if (*ppDstBuffer == nullptr)
    3732             :     {
    3733           0 :         return CE_Failure;
    3734             :     }
    3735          10 :     float *const pafDstBuffer = static_cast<float *>(*ppDstBuffer);
    3736          10 :     *peDstBufferDataType = GDT_CFloat32;
    3737             : 
    3738          10 :     const int nOYSize = poOverview->GetYSize();
    3739          10 :     const double dfXRatioDstToSrc = static_cast<double>(nSrcWidth) / nOXSize;
    3740          10 :     const double dfYRatioDstToSrc = static_cast<double>(nSrcHeight) / nOYSize;
    3741             : 
    3742             :     /* ==================================================================== */
    3743             :     /*      Loop over destination scanlines.                                */
    3744             :     /* ==================================================================== */
    3745          96 :     for (int iDstLine = nDstYOff; iDstLine < nDstYOff2; ++iDstLine)
    3746             :     {
    3747          86 :         int nSrcYOff = static_cast<int>(0.5 + iDstLine * dfYRatioDstToSrc);
    3748          86 :         if (nSrcYOff < nChunkYOff)
    3749           0 :             nSrcYOff = nChunkYOff;
    3750             : 
    3751          86 :         int nSrcYOff2 =
    3752          86 :             static_cast<int>(0.5 + (iDstLine + 1) * dfYRatioDstToSrc);
    3753          86 :         if (nSrcYOff2 == nSrcYOff)
    3754           0 :             nSrcYOff2++;
    3755             : 
    3756          86 :         if (nSrcYOff2 > nSrcHeight || iDstLine == nOYSize - 1)
    3757             :         {
    3758          10 :             if (nSrcYOff == nSrcHeight && nSrcHeight - 1 >= nChunkYOff)
    3759           0 :                 nSrcYOff = nSrcHeight - 1;
    3760          10 :             nSrcYOff2 = nSrcHeight;
    3761             :         }
    3762          86 :         if (nSrcYOff2 > nChunkYOff + nChunkYSize)
    3763           0 :             nSrcYOff2 = nChunkYOff + nChunkYSize;
    3764             : 
    3765          86 :         const float *const pafSrcScanline =
    3766          86 :             pafChunk + ((nSrcYOff - nChunkYOff) * nSrcWidth) * 2;
    3767          86 :         float *const pafDstScanline =
    3768          86 :             pafDstBuffer + (iDstLine - nDstYOff) * 2 * nOXSize;
    3769             : 
    3770             :         /* --------------------------------------------------------------------
    3771             :          */
    3772             :         /*      Loop over destination pixels */
    3773             :         /* --------------------------------------------------------------------
    3774             :          */
    3775         898 :         for (int iDstPixel = 0; iDstPixel < nOXSize; ++iDstPixel)
    3776             :         {
    3777         812 :             int nSrcXOff = static_cast<int>(0.5 + iDstPixel * dfXRatioDstToSrc);
    3778         812 :             int nSrcXOff2 =
    3779         812 :                 static_cast<int>(0.5 + (iDstPixel + 1) * dfXRatioDstToSrc);
    3780         812 :             if (nSrcXOff2 == nSrcXOff)
    3781           0 :                 nSrcXOff2++;
    3782         812 :             if (nSrcXOff2 > nSrcWidth || iDstPixel == nOXSize - 1)
    3783             :             {
    3784          86 :                 if (nSrcXOff == nSrcWidth && nSrcWidth - 1 >= 0)
    3785           0 :                     nSrcXOff = nSrcWidth - 1;
    3786          86 :                 nSrcXOff2 = nSrcWidth;
    3787             :             }
    3788             : 
    3789         812 :             if (eMethod == NEAR)
    3790             :             {
    3791         800 :                 pafDstScanline[iDstPixel * 2] = pafSrcScanline[nSrcXOff * 2];
    3792         800 :                 pafDstScanline[iDstPixel * 2 + 1] =
    3793         800 :                     pafSrcScanline[nSrcXOff * 2 + 1];
    3794             :             }
    3795          12 :             else if (eMethod == AVERAGE_MAGPHASE)
    3796             :             {
    3797           0 :                 double dfTotalR = 0.0;
    3798           0 :                 double dfTotalI = 0.0;
    3799           0 :                 double dfTotalM = 0.0;
    3800           0 :                 int nCount = 0;
    3801             : 
    3802           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    3803             :                 {
    3804           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    3805             :                     {
    3806           0 :                         const double dfR =
    3807           0 :                             pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
    3808           0 :                                                         iY - nSrcYOff) *
    3809           0 :                                                         nSrcWidth * 2];
    3810           0 :                         const double dfI =
    3811           0 :                             pafSrcScanline[iX * 2 +
    3812           0 :                                            static_cast<GPtrDiff_t>(iY -
    3813           0 :                                                                    nSrcYOff) *
    3814           0 :                                                nSrcWidth * 2 +
    3815           0 :                                            1];
    3816           0 :                         dfTotalR += dfR;
    3817           0 :                         dfTotalI += dfI;
    3818           0 :                         dfTotalM += std::hypot(dfR, dfI);
    3819           0 :                         ++nCount;
    3820             :                     }
    3821             :                 }
    3822             : 
    3823           0 :                 CPLAssert(nCount > 0);
    3824           0 :                 if (nCount == 0)
    3825             :                 {
    3826           0 :                     pafDstScanline[iDstPixel * 2] = 0.0;
    3827           0 :                     pafDstScanline[iDstPixel * 2 + 1] = 0.0;
    3828             :                 }
    3829             :                 else
    3830             :                 {
    3831           0 :                     pafDstScanline[iDstPixel * 2] =
    3832           0 :                         static_cast<float>(dfTotalR / nCount);
    3833           0 :                     pafDstScanline[iDstPixel * 2 + 1] =
    3834           0 :                         static_cast<float>(dfTotalI / nCount);
    3835             :                     const double dfM =
    3836           0 :                         std::hypot(pafDstScanline[iDstPixel * 2],
    3837           0 :                                    pafDstScanline[iDstPixel * 2 + 1]);
    3838           0 :                     const double dfDesiredM = dfTotalM / nCount;
    3839           0 :                     double dfRatio = 1.0;
    3840           0 :                     if (dfM != 0.0)
    3841           0 :                         dfRatio = dfDesiredM / dfM;
    3842             : 
    3843           0 :                     pafDstScanline[iDstPixel * 2] *=
    3844           0 :                         static_cast<float>(dfRatio);
    3845           0 :                     pafDstScanline[iDstPixel * 2 + 1] *=
    3846           0 :                         static_cast<float>(dfRatio);
    3847             :                 }
    3848             :             }
    3849          12 :             else if (eMethod == RMS)
    3850             :             {
    3851          12 :                 double dfTotalR = 0.0;
    3852          12 :                 double dfTotalI = 0.0;
    3853          12 :                 int nCount = 0;
    3854             : 
    3855          36 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    3856             :                 {
    3857          72 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    3858             :                     {
    3859          48 :                         const double dfR =
    3860          48 :                             pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
    3861          48 :                                                         iY - nSrcYOff) *
    3862          48 :                                                         nSrcWidth * 2];
    3863          48 :                         const double dfI =
    3864          48 :                             pafSrcScanline[iX * 2 +
    3865          48 :                                            static_cast<GPtrDiff_t>(iY -
    3866          48 :                                                                    nSrcYOff) *
    3867          48 :                                                nSrcWidth * 2 +
    3868          48 :                                            1];
    3869             : 
    3870          48 :                         dfTotalR += SQUARE(dfR);
    3871          48 :                         dfTotalI += SQUARE(dfI);
    3872             : 
    3873          48 :                         ++nCount;
    3874             :                     }
    3875             :                 }
    3876             : 
    3877          12 :                 CPLAssert(nCount > 0);
    3878          12 :                 if (nCount == 0)
    3879             :                 {
    3880           0 :                     pafDstScanline[iDstPixel * 2] = 0.0;
    3881           0 :                     pafDstScanline[iDstPixel * 2 + 1] = 0.0;
    3882             :                 }
    3883             :                 else
    3884             :                 {
    3885             :                     /* compute RMS */
    3886          12 :                     pafDstScanline[iDstPixel * 2] =
    3887          12 :                         static_cast<float>(sqrt(dfTotalR / nCount));
    3888          12 :                     pafDstScanline[iDstPixel * 2 + 1] =
    3889          12 :                         static_cast<float>(sqrt(dfTotalI / nCount));
    3890             :                 }
    3891             :             }
    3892           0 :             else if (eMethod == AVERAGE)
    3893             :             {
    3894           0 :                 double dfTotalR = 0.0;
    3895           0 :                 double dfTotalI = 0.0;
    3896           0 :                 int nCount = 0;
    3897             : 
    3898           0 :                 for (int iY = nSrcYOff; iY < nSrcYOff2; ++iY)
    3899             :                 {
    3900           0 :                     for (int iX = nSrcXOff; iX < nSrcXOff2; ++iX)
    3901             :                     {
    3902             :                         // TODO(schwehr): Maybe use std::complex?
    3903           0 :                         dfTotalR +=
    3904           0 :                             pafSrcScanline[iX * 2 + static_cast<GPtrDiff_t>(
    3905           0 :                                                         iY - nSrcYOff) *
    3906           0 :                                                         nSrcWidth * 2];
    3907           0 :                         dfTotalI += pafSrcScanline[iX * 2 +
    3908           0 :                                                    static_cast<GPtrDiff_t>(
    3909           0 :                                                        iY - nSrcYOff) *
    3910           0 :                                                        nSrcWidth * 2 +
    3911           0 :                                                    1];
    3912           0 :                         ++nCount;
    3913             :                     }
    3914             :                 }
    3915             : 
    3916           0 :                 CPLAssert(nCount > 0);
    3917           0 :                 if (nCount == 0)
    3918             :                 {
    3919           0 :                     pafDstScanline[iDstPixel * 2] = 0.0;
    3920           0 :                     pafDstScanline[iDstPixel * 2 + 1] = 0.0;
    3921             :                 }
    3922             :                 else
    3923             :                 {
    3924           0 :                     pafDstScanline[iDstPixel * 2] =
    3925           0 :                         static_cast<float>(dfTotalR / nCount);
    3926           0 :                     pafDstScanline[iDstPixel * 2 + 1] =
    3927           0 :                         static_cast<float>(dfTotalI / nCount);
    3928             :                 }
    3929             :             }
    3930             :         }
    3931             :     }
    3932             : 
    3933          10 :     return CE_None;
    3934             : }
    3935             : 
    3936             : /************************************************************************/
    3937             : /*                  GDALRegenerateCascadingOverviews()                  */
    3938             : /*                                                                      */
    3939             : /*      Generate a list of overviews in order from largest to           */
    3940             : /*      smallest, computing each from the next larger.                  */
    3941             : /************************************************************************/
    3942             : 
    3943          42 : static CPLErr GDALRegenerateCascadingOverviews(
    3944             :     GDALRasterBand *poSrcBand, int nOverviews, GDALRasterBand **papoOvrBands,
    3945             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    3946             :     void *pProgressData, CSLConstList papszOptions)
    3947             : 
    3948             : {
    3949             :     /* -------------------------------------------------------------------- */
    3950             :     /*      First, we must put the overviews in order from largest to       */
    3951             :     /*      smallest.                                                       */
    3952             :     /* -------------------------------------------------------------------- */
    3953         120 :     for (int i = 0; i < nOverviews - 1; ++i)
    3954             :     {
    3955         270 :         for (int j = 0; j < nOverviews - i - 1; ++j)
    3956             :         {
    3957         192 :             if (papoOvrBands[j]->GetXSize() *
    3958         192 :                     static_cast<float>(papoOvrBands[j]->GetYSize()) <
    3959         192 :                 papoOvrBands[j + 1]->GetXSize() *
    3960         192 :                     static_cast<float>(papoOvrBands[j + 1]->GetYSize()))
    3961             :             {
    3962           0 :                 GDALRasterBand *poTempBand = papoOvrBands[j];
    3963           0 :                 papoOvrBands[j] = papoOvrBands[j + 1];
    3964           0 :                 papoOvrBands[j + 1] = poTempBand;
    3965             :             }
    3966             :         }
    3967             :     }
    3968             : 
    3969             :     /* -------------------------------------------------------------------- */
    3970             :     /*      Count total pixels so we can prepare appropriate scaled         */
    3971             :     /*      progress functions.                                             */
    3972             :     /* -------------------------------------------------------------------- */
    3973          42 :     double dfTotalPixels = 0.0;
    3974             : 
    3975         162 :     for (int i = 0; i < nOverviews; ++i)
    3976             :     {
    3977         120 :         dfTotalPixels += papoOvrBands[i]->GetXSize() *
    3978         120 :                          static_cast<double>(papoOvrBands[i]->GetYSize());
    3979             :     }
    3980             : 
    3981             :     /* -------------------------------------------------------------------- */
    3982             :     /*      Generate all the bands.                                         */
    3983             :     /* -------------------------------------------------------------------- */
    3984          42 :     double dfPixelsProcessed = 0.0;
    3985             : 
    3986         162 :     for (int i = 0; i < nOverviews; ++i)
    3987             :     {
    3988         120 :         GDALRasterBand *poBaseBand = poSrcBand;
    3989         120 :         if (i != 0)
    3990          78 :             poBaseBand = papoOvrBands[i - 1];
    3991             : 
    3992         120 :         double dfPixels = papoOvrBands[i]->GetXSize() *
    3993         120 :                           static_cast<double>(papoOvrBands[i]->GetYSize());
    3994             : 
    3995         240 :         void *pScaledProgressData = GDALCreateScaledProgress(
    3996             :             dfPixelsProcessed / dfTotalPixels,
    3997         120 :             (dfPixelsProcessed + dfPixels) / dfTotalPixels, pfnProgress,
    3998             :             pProgressData);
    3999             : 
    4000         240 :         const CPLErr eErr = GDALRegenerateOverviewsEx(
    4001             :             poBaseBand, 1,
    4002         120 :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands) + i,
    4003             :             pszResampling, GDALScaledProgress, pScaledProgressData,
    4004             :             papszOptions);
    4005         120 :         GDALDestroyScaledProgress(pScaledProgressData);
    4006             : 
    4007         120 :         if (eErr != CE_None)
    4008           0 :             return eErr;
    4009             : 
    4010         120 :         dfPixelsProcessed += dfPixels;
    4011             : 
    4012             :         // Only do the bit2grayscale promotion on the base band.
    4013         120 :         if (STARTS_WITH_CI(pszResampling,
    4014             :                            "AVERAGE_BIT2G" /* AVERAGE_BIT2GRAYSCALE */))
    4015           8 :             pszResampling = "AVERAGE";
    4016             :     }
    4017             : 
    4018          42 :     return CE_None;
    4019             : }
    4020             : 
    4021             : /************************************************************************/
    4022             : /*                    GDALGetResampleFunction()                         */
    4023             : /************************************************************************/
    4024             : 
    4025        3688 : GDALResampleFunction GDALGetResampleFunction(const char *pszResampling,
    4026             :                                              int *pnRadius)
    4027             : {
    4028        3688 :     if (pnRadius)
    4029        3689 :         *pnRadius = 0;
    4030        3688 :     if (STARTS_WITH_CI(pszResampling, "NEAR"))
    4031         365 :         return GDALResampleChunk_Near;
    4032        3323 :     else if (STARTS_WITH_CI(pszResampling, "AVER") ||
    4033        2812 :              EQUAL(pszResampling, "RMS"))
    4034         535 :         return GDALResampleChunk_AverageOrRMS;
    4035        2788 :     else if (EQUAL(pszResampling, "GAUSS"))
    4036             :     {
    4037          26 :         if (pnRadius)
    4038          26 :             *pnRadius = 1;
    4039          26 :         return GDALResampleChunk_Gauss;
    4040             :     }
    4041        2762 :     else if (EQUAL(pszResampling, "MODE"))
    4042          40 :         return GDALResampleChunk_Mode;
    4043        2722 :     else if (EQUAL(pszResampling, "CUBIC"))
    4044             :     {
    4045         363 :         if (pnRadius)
    4046         363 :             *pnRadius = GWKGetFilterRadius(GRA_Cubic);
    4047         367 :         return GDALResampleChunk_Convolution;
    4048             :     }
    4049        2359 :     else if (EQUAL(pszResampling, "CUBICSPLINE"))
    4050             :     {
    4051           3 :         if (pnRadius)
    4052           3 :             *pnRadius = GWKGetFilterRadius(GRA_CubicSpline);
    4053           3 :         return GDALResampleChunk_Convolution;
    4054             :     }
    4055        2356 :     else if (EQUAL(pszResampling, "LANCZOS"))
    4056             :     {
    4057           6 :         if (pnRadius)
    4058           6 :             *pnRadius = GWKGetFilterRadius(GRA_Lanczos);
    4059           6 :         return GDALResampleChunk_Convolution;
    4060             :     }
    4061        2350 :     else if (EQUAL(pszResampling, "BILINEAR"))
    4062             :     {
    4063        2347 :         if (pnRadius)
    4064        2347 :             *pnRadius = GWKGetFilterRadius(GRA_Bilinear);
    4065        2347 :         return GDALResampleChunk_Convolution;
    4066             :     }
    4067             :     else
    4068             :     {
    4069           3 :         CPLError(
    4070             :             CE_Failure, CPLE_AppDefined,
    4071             :             "GDALGetResampleFunction: Unsupported resampling method \"%s\".",
    4072             :             pszResampling);
    4073           0 :         return nullptr;
    4074             :     }
    4075             : }
    4076             : 
    4077             : /************************************************************************/
    4078             : /*                      GDALGetOvrWorkDataType()                        */
    4079             : /************************************************************************/
    4080             : 
    4081        3567 : GDALDataType GDALGetOvrWorkDataType(const char *pszResampling,
    4082             :                                     GDALDataType eSrcDataType)
    4083             : {
    4084        3567 :     if ((STARTS_WITH_CI(pszResampling, "NEAR") ||
    4085        3216 :          STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
    4086        2709 :          EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
    4087        2405 :          EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
    4088        3567 :          EQUAL(pszResampling, "MODE")) &&
    4089             :         eSrcDataType == GDT_Byte)
    4090             :     {
    4091        3191 :         return GDT_Byte;
    4092             :     }
    4093         376 :     else if ((STARTS_WITH_CI(pszResampling, "NEAR") ||
    4094         330 :               STARTS_WITH_CI(pszResampling, "AVER") ||
    4095         285 :               EQUAL(pszResampling, "RMS") || EQUAL(pszResampling, "CUBIC") ||
    4096         166 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4097         166 :               EQUAL(pszResampling, "LANCZOS") ||
    4098         163 :               EQUAL(pszResampling, "BILINEAR") ||
    4099         376 :               EQUAL(pszResampling, "MODE")) &&
    4100             :              eSrcDataType == GDT_UInt16)
    4101             :     {
    4102         104 :         return GDT_UInt16;
    4103             :     }
    4104         272 :     else if (EQUAL(pszResampling, "GAUSS"))
    4105          20 :         return GDT_Float64;
    4106             : 
    4107         252 :     if (eSrcDataType == GDT_Float64)
    4108          34 :         return GDT_Float64;
    4109             : 
    4110         218 :     return GDT_Float32;
    4111             : }
    4112             : 
    4113             : namespace
    4114             : {
    4115             : // Structure to hold a pointer to free with CPLFree()
    4116             : struct PointerHolder
    4117             : {
    4118             :     void *ptr = nullptr;
    4119             : 
    4120        5170 :     explicit PointerHolder(void *ptrIn) : ptr(ptrIn)
    4121             :     {
    4122        5170 :     }
    4123             : 
    4124        5170 :     ~PointerHolder()
    4125        5170 :     {
    4126        5170 :         CPLFree(ptr);
    4127        5170 :     }
    4128             : 
    4129             :     PointerHolder(const PointerHolder &) = delete;
    4130             :     PointerHolder &operator=(const PointerHolder &) = delete;
    4131             : };
    4132             : }  // namespace
    4133             : 
    4134             : /************************************************************************/
    4135             : /*                      GDALRegenerateOverviews()                       */
    4136             : /************************************************************************/
    4137             : 
    4138             : /**
    4139             :  * \brief Generate downsampled overviews.
    4140             :  *
    4141             :  * This function will generate one or more overview images from a base image
    4142             :  * using the requested downsampling algorithm.  Its primary use is for
    4143             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4144             :  * used to generate downsampled images in one file from another outside the
    4145             :  * overview architecture.
    4146             :  *
    4147             :  * The output bands need to exist in advance.
    4148             :  *
    4149             :  * The full set of resampling algorithms is documented in
    4150             :  * GDALDataset::BuildOverviews().
    4151             :  *
    4152             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    4153             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    4154             :  * considered as the nodata value and not each value of the triplet
    4155             :  * independently per band.
    4156             :  *
    4157             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4158             :  * to "ALL_CPUS" or a integer value to specify the number of threads to use for
    4159             :  * overview computation.
    4160             :  *
    4161             :  * @param hSrcBand the source (base level) band.
    4162             :  * @param nOverviewCount the number of downsampled bands being generated.
    4163             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4164             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4165             :  * @param pfnProgress progress report function.
    4166             :  * @param pProgressData progress function callback data.
    4167             :  * @return CE_None on success or CE_Failure on failure.
    4168             :  */
    4169         244 : CPLErr GDALRegenerateOverviews(GDALRasterBandH hSrcBand, int nOverviewCount,
    4170             :                                GDALRasterBandH *pahOvrBands,
    4171             :                                const char *pszResampling,
    4172             :                                GDALProgressFunc pfnProgress,
    4173             :                                void *pProgressData)
    4174             : 
    4175             : {
    4176         244 :     return GDALRegenerateOverviewsEx(hSrcBand, nOverviewCount, pahOvrBands,
    4177             :                                      pszResampling, pfnProgress, pProgressData,
    4178         244 :                                      nullptr);
    4179             : }
    4180             : 
    4181             : /************************************************************************/
    4182             : /*                     GDALRegenerateOverviewsEx()                      */
    4183             : /************************************************************************/
    4184             : 
    4185             : /**
    4186             :  * \brief Generate downsampled overviews.
    4187             :  *
    4188             :  * This function will generate one or more overview images from a base image
    4189             :  * using the requested downsampling algorithm.  Its primary use is for
    4190             :  * generating overviews via GDALDataset::BuildOverviews(), but it can also be
    4191             :  * used to generate downsampled images in one file from another outside the
    4192             :  * overview architecture.
    4193             :  *
    4194             :  * The output bands need to exist in advance.
    4195             :  *
    4196             :  * The full set of resampling algorithms is documented in
    4197             :  * GDALDataset::BuildOverviews().
    4198             :  *
    4199             :  * This function will honour properly NODATA_VALUES tuples (special dataset
    4200             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    4201             :  * considered as the nodata value and not each value of the triplet
    4202             :  * independently per band.
    4203             :  *
    4204             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4205             :  * to "ALL_CPUS" or a integer value to specify the number of threads to use for
    4206             :  * overview computation.
    4207             :  *
    4208             :  * @param hSrcBand the source (base level) band.
    4209             :  * @param nOverviewCount the number of downsampled bands being generated.
    4210             :  * @param pahOvrBands the list of downsampled bands to be generated.
    4211             :  * @param pszResampling Resampling algorithm (e.g. "AVERAGE").
    4212             :  * @param pfnProgress progress report function.
    4213             :  * @param pProgressData progress function callback data.
    4214             :  * @param papszOptions NULL terminated list of options as key=value pairs, or
    4215             :  * NULL
    4216             :  * @return CE_None on success or CE_Failure on failure.
    4217             :  * @since GDAL 3.6
    4218             :  */
    4219         729 : CPLErr GDALRegenerateOverviewsEx(GDALRasterBandH hSrcBand, int nOverviewCount,
    4220             :                                  GDALRasterBandH *pahOvrBands,
    4221             :                                  const char *pszResampling,
    4222             :                                  GDALProgressFunc pfnProgress,
    4223             :                                  void *pProgressData, CSLConstList papszOptions)
    4224             : 
    4225             : {
    4226         729 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    4227         729 :     GDALRasterBand **papoOvrBands =
    4228             :         reinterpret_cast<GDALRasterBand **>(pahOvrBands);
    4229             : 
    4230         729 :     if (pfnProgress == nullptr)
    4231         244 :         pfnProgress = GDALDummyProgress;
    4232             : 
    4233         729 :     if (EQUAL(pszResampling, "NONE"))
    4234          61 :         return CE_None;
    4235             : 
    4236         668 :     int nKernelRadius = 0;
    4237             :     GDALResampleFunction pfnResampleFn =
    4238         668 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    4239             : 
    4240         668 :     if (pfnResampleFn == nullptr)
    4241           0 :         return CE_Failure;
    4242             : 
    4243             :     /* -------------------------------------------------------------------- */
    4244             :     /*      Check color tables...                                           */
    4245             :     /* -------------------------------------------------------------------- */
    4246         668 :     GDALColorTable *poColorTable = nullptr;
    4247             : 
    4248         318 :     if ((STARTS_WITH_CI(pszResampling, "AVER") || EQUAL(pszResampling, "RMS") ||
    4249        1382 :          EQUAL(pszResampling, "MODE") || EQUAL(pszResampling, "GAUSS")) &&
    4250         407 :         poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    4251             :     {
    4252           9 :         poColorTable = poSrcBand->GetColorTable();
    4253           9 :         if (poColorTable != nullptr)
    4254             :         {
    4255           9 :             if (poColorTable->GetPaletteInterpretation() != GPI_RGB)
    4256             :             {
    4257           0 :                 CPLError(CE_Warning, CPLE_AppDefined,
    4258             :                          "Computing overviews on palette index raster bands "
    4259             :                          "with a palette whose color interpretation is not RGB "
    4260             :                          "will probably lead to unexpected results.");
    4261           0 :                 poColorTable = nullptr;
    4262             :             }
    4263           9 :             else if (poColorTable->IsIdentity())
    4264             :             {
    4265           0 :                 poColorTable = nullptr;
    4266             :             }
    4267             :         }
    4268             :         else
    4269             :         {
    4270           0 :             CPLError(CE_Warning, CPLE_AppDefined,
    4271             :                      "Computing overviews on palette index raster bands "
    4272             :                      "without a palette will probably lead to unexpected "
    4273             :                      "results.");
    4274             :         }
    4275             :     }
    4276             :     // Not ready yet
    4277        1923 :     else if ((EQUAL(pszResampling, "CUBIC") ||
    4278         605 :               EQUAL(pszResampling, "CUBICSPLINE") ||
    4279         605 :               EQUAL(pszResampling, "LANCZOS") ||
    4280        1321 :               EQUAL(pszResampling, "BILINEAR")) &&
    4281          57 :              poSrcBand->GetColorInterpretation() == GCI_PaletteIndex)
    4282             :     {
    4283           0 :         CPLError(CE_Warning, CPLE_AppDefined,
    4284             :                  "Computing %s overviews on palette index raster bands "
    4285             :                  "will probably lead to unexpected results.",
    4286             :                  pszResampling);
    4287             :     }
    4288             : 
    4289             :     // If we have a nodata mask and we are doing something more complicated
    4290             :     // than nearest neighbouring, we have to fetch to nodata mask.
    4291             : 
    4292         668 :     GDALRasterBand *poMaskBand = nullptr;
    4293         668 :     bool bUseNoDataMask = false;
    4294         668 :     bool bCanUseCascaded = true;
    4295             : 
    4296         668 :     if (!STARTS_WITH_CI(pszResampling, "NEAR"))
    4297             :     {
    4298             :         // Special case if we are an alpha/mask band. We want it to be
    4299             :         // considered as the mask band to avoid alpha=0 to be taken into account
    4300             :         // in average computation.
    4301         464 :         if (poSrcBand->IsMaskBand())
    4302             :         {
    4303          88 :             poMaskBand = poSrcBand;
    4304          88 :             bUseNoDataMask = true;
    4305             :         }
    4306             :         else
    4307             :         {
    4308         376 :             poMaskBand = poSrcBand->GetMaskBand();
    4309         376 :             const int nMaskFlags = poSrcBand->GetMaskFlags();
    4310         376 :             bCanUseCascaded =
    4311         376 :                 (nMaskFlags == GMF_NODATA || nMaskFlags == GMF_ALL_VALID);
    4312         376 :             bUseNoDataMask = (nMaskFlags & GMF_ALL_VALID) == 0;
    4313             :         }
    4314             :     }
    4315             : 
    4316             :     /* -------------------------------------------------------------------- */
    4317             :     /*      If we are operating on multiple overviews, and using            */
    4318             :     /*      averaging, lets do them in cascading order to reduce the        */
    4319             :     /*      amount of computation.                                          */
    4320             :     /* -------------------------------------------------------------------- */
    4321             : 
    4322             :     // In case the mask made be computed from another band of the dataset,
    4323             :     // we can't use cascaded generation, as the computation of the overviews
    4324             :     // of the band used for the mask band may not have yet occurred (#3033).
    4325         668 :     if ((STARTS_WITH_CI(pszResampling, "AVER") ||
    4326         318 :          EQUAL(pszResampling, "GAUSS") || EQUAL(pszResampling, "RMS") ||
    4327         287 :          EQUAL(pszResampling, "CUBIC") || EQUAL(pszResampling, "CUBICSPLINE") ||
    4328         233 :          EQUAL(pszResampling, "LANCZOS") || EQUAL(pszResampling, "BILINEAR") ||
    4329         668 :          EQUAL(pszResampling, "MODE")) &&
    4330          42 :         nOverviewCount > 1 && bCanUseCascaded)
    4331          42 :         return GDALRegenerateCascadingOverviews(
    4332             :             poSrcBand, nOverviewCount, papoOvrBands, pszResampling, pfnProgress,
    4333          42 :             pProgressData, papszOptions);
    4334             : 
    4335             :     /* -------------------------------------------------------------------- */
    4336             :     /*      Setup one horizontal swath to read from the raw buffer.         */
    4337             :     /* -------------------------------------------------------------------- */
    4338         626 :     int nFRXBlockSize = 0;
    4339         626 :     int nFRYBlockSize = 0;
    4340         626 :     poSrcBand->GetBlockSize(&nFRXBlockSize, &nFRYBlockSize);
    4341             : 
    4342         626 :     const GDALDataType eSrcDataType = poSrcBand->GetRasterDataType();
    4343             :     const GDALDataType eWrkDataType =
    4344         626 :         GDALDataTypeIsComplex(eSrcDataType)
    4345         626 :             ? GDT_CFloat32
    4346         616 :             : GDALGetOvrWorkDataType(pszResampling, eSrcDataType);
    4347             : 
    4348         626 :     const int nWidth = poSrcBand->GetXSize();
    4349         626 :     const int nHeight = poSrcBand->GetYSize();
    4350             : 
    4351         626 :     int nMaxOvrFactor = 1;
    4352        1330 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    4353             :     {
    4354         704 :         const int nDstWidth = papoOvrBands[iOverview]->GetXSize();
    4355         704 :         const int nDstHeight = papoOvrBands[iOverview]->GetYSize();
    4356         704 :         nMaxOvrFactor = std::max(
    4357             :             nMaxOvrFactor,
    4358         704 :             static_cast<int>(static_cast<double>(nWidth) / nDstWidth + 0.5));
    4359         704 :         nMaxOvrFactor = std::max(
    4360             :             nMaxOvrFactor,
    4361         704 :             static_cast<int>(static_cast<double>(nHeight) / nDstHeight + 0.5));
    4362             :     }
    4363             : 
    4364         626 :     int nFullResYChunk = nFRYBlockSize;
    4365         626 :     int nMaxChunkYSizeQueried = 0;
    4366             : 
    4367             :     const auto UpdateChunkHeightAndGetChunkSize =
    4368        7903 :         [&nFullResYChunk, &nMaxChunkYSizeQueried, nKernelRadius, nMaxOvrFactor,
    4369       23709 :          eWrkDataType, nWidth]()
    4370             :     {
    4371             :         // Make sure that round(nChunkYOff / nMaxOvrFactor) < round((nChunkYOff
    4372             :         // + nFullResYChunk) / nMaxOvrFactor)
    4373        7903 :         nFullResYChunk = std::max(nFullResYChunk, 2 * nMaxOvrFactor);
    4374        7903 :         nMaxChunkYSizeQueried =
    4375        7903 :             nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
    4376        7903 :         return static_cast<GIntBig>(GDALGetDataTypeSizeBytes(eWrkDataType)) *
    4377        7903 :                nMaxChunkYSizeQueried * nWidth;
    4378         626 :     };
    4379             : 
    4380             :     // Only configurable for debug / testing
    4381             :     const char *pszChunkYSize =
    4382         626 :         CPLGetConfigOption("GDAL_OVR_CHUNKYSIZE", nullptr);
    4383         626 :     if (pszChunkYSize)
    4384             :     {
    4385             :         // coverity[tainted_data]
    4386           0 :         nFullResYChunk = atoi(pszChunkYSize);
    4387             :     }
    4388             : 
    4389             :     // Only configurable for debug / testing
    4390             :     const int nChunkMaxSize =
    4391         626 :         atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
    4392             : 
    4393         626 :     auto nChunkSize = UpdateChunkHeightAndGetChunkSize();
    4394         626 :     if (nChunkSize > nChunkMaxSize)
    4395             :     {
    4396           3 :         if (poColorTable == nullptr && nFRXBlockSize < nWidth &&
    4397           9 :             !GDALDataTypeIsComplex(eSrcDataType) &&
    4398           3 :             (!STARTS_WITH_CI(pszResampling, "AVER") ||
    4399           0 :              EQUAL(pszResampling, "AVERAGE")))
    4400             :         {
    4401             :             // If this is tiled, then use GDALRegenerateOverviewsMultiBand()
    4402             :             // which use a block based strategy, which is much less memory
    4403             :             // hungry.
    4404           3 :             return GDALRegenerateOverviewsMultiBand(
    4405             :                 1, &poSrcBand, nOverviewCount, &papoOvrBands, pszResampling,
    4406           3 :                 pfnProgress, pProgressData, papszOptions);
    4407             :         }
    4408           0 :         else if (nOverviewCount > 1 && STARTS_WITH_CI(pszResampling, "NEAR"))
    4409             :         {
    4410           0 :             return GDALRegenerateCascadingOverviews(
    4411             :                 poSrcBand, nOverviewCount, papoOvrBands, pszResampling,
    4412           0 :                 pfnProgress, pProgressData, papszOptions);
    4413             :         }
    4414             :     }
    4415         623 :     else if (pszChunkYSize == nullptr)
    4416             :     {
    4417             :         // Try to get as close as possible to nChunkMaxSize
    4418        7900 :         while (nChunkSize * 2 < nChunkMaxSize)
    4419             :         {
    4420        7277 :             nFullResYChunk *= 2;
    4421        7277 :             nChunkSize = UpdateChunkHeightAndGetChunkSize();
    4422             :         }
    4423             :     }
    4424             : 
    4425         623 :     int nHasNoData = 0;
    4426         623 :     const double dfNoDataValue = poSrcBand->GetNoDataValue(&nHasNoData);
    4427         623 :     const bool bHasNoData = CPL_TO_BOOL(nHasNoData);
    4428             :     const bool bPropagateNoData =
    4429         623 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    4430             : 
    4431             :     // Structure describing a resampling job
    4432             :     struct OvrJob
    4433             :     {
    4434             :         // Buffers to free when job is finished
    4435             :         std::shared_ptr<PointerHolder> oSrcMaskBufferHolder{};
    4436             :         std::shared_ptr<PointerHolder> oSrcBufferHolder{};
    4437             :         std::unique_ptr<PointerHolder> oDstBufferHolder{};
    4438             : 
    4439             :         // Input parameters of pfnResampleFn
    4440             :         GDALResampleFunction pfnResampleFn = nullptr;
    4441             :         double dfXRatioDstToSrc{};
    4442             :         double dfYRatioDstToSrc{};
    4443             :         GDALDataType eWrkDataType = GDT_Unknown;
    4444             :         const void *pChunk = nullptr;
    4445             :         const GByte *pabyChunkNodataMask = nullptr;
    4446             :         int nWidth = 0;
    4447             :         int nHeight = 0;
    4448             :         int nChunkYOff = 0;
    4449             :         int nChunkYSize = 0;
    4450             :         int nDstWidth = 0;
    4451             :         int nDstYOff = 0;
    4452             :         int nDstYOff2 = 0;
    4453             :         GDALRasterBand *poDstBand = nullptr;
    4454             :         const char *pszResampling = nullptr;
    4455             :         bool bHasNoData = false;
    4456             :         double dfNoDataValue = 0.0;
    4457             :         GDALColorTable *poColorTable = nullptr;
    4458             :         GDALDataType eSrcDataType = GDT_Unknown;
    4459             :         bool bPropagateNoData = false;
    4460             : 
    4461             :         // Output values of resampling function
    4462             :         CPLErr eErr = CE_Failure;
    4463             :         void *pDstBuffer = nullptr;
    4464             :         GDALDataType eDstBufferDataType = GDT_Unknown;
    4465             : 
    4466             :         // Synchronization
    4467             :         bool bFinished = false;
    4468             :         std::mutex mutex{};
    4469             :         std::condition_variable cv{};
    4470             : 
    4471           0 :         void SetSrcMaskBufferHolder(
    4472             :             const std::shared_ptr<PointerHolder> &oSrcMaskBufferHolderIn)
    4473             :         {
    4474           0 :             oSrcMaskBufferHolder = oSrcMaskBufferHolderIn;
    4475           0 :         }
    4476             : 
    4477           0 :         void SetSrcBufferHolder(
    4478             :             const std::shared_ptr<PointerHolder> &oSrcBufferHolderIn)
    4479             :         {
    4480           0 :             oSrcBufferHolder = oSrcBufferHolderIn;
    4481           0 :         }
    4482             :     };
    4483             : 
    4484             :     // Thread function to resample: pData must point to an OvrJob. Used both as a job-queue callback (SubmitJob) and as a direct synchronous call. The outcome is stored in poJob->eErr.
    4485         702 :     const auto JobResampleFunc = [](void *pData)
    4486             :     {
    4487         702 :         OvrJob *poJob = static_cast<OvrJob *>(pData);
    4488             :         // GDT_CFloat32 working data goes through GDALResampleChunkC32R(); every other working type uses the job's pfnResampleFn.
    4489         702 :         if (poJob->eWrkDataType != GDT_CFloat32)
    4490             :         {
    4491         692 :             poJob->eErr = poJob->pfnResampleFn(
    4492             :                 poJob->dfXRatioDstToSrc, poJob->dfYRatioDstToSrc, 0.0, 0.0,
    4493             :                 poJob->eWrkDataType, poJob->pChunk, poJob->pabyChunkNodataMask,
    4494             :                 0, poJob->nWidth, poJob->nChunkYOff, poJob->nChunkYSize, 0,
    4495             :                 poJob->nDstWidth, poJob->nDstYOff, poJob->nDstYOff2,
    4496             :                 poJob->poDstBand, &(poJob->pDstBuffer),
    4497             :                 &(poJob->eDstBufferDataType), poJob->pszResampling,
    4498         692 :                 poJob->bHasNoData, poJob->dfNoDataValue, poJob->poColorTable,
    4499         692 :                 poJob->eSrcDataType, poJob->bPropagateNoData);
    4500             :         }
    4501             :         else
    4502             :         {
    4503          10 :             poJob->eErr = GDALResampleChunkC32R(
    4504             :                 poJob->nWidth, poJob->nHeight,
    4505          10 :                 static_cast<const float *>(poJob->pChunk), poJob->nChunkYOff,
    4506             :                 poJob->nChunkYSize, poJob->nDstYOff, poJob->nDstYOff2,
    4507             :                 poJob->poDstBand, &(poJob->pDstBuffer),
    4508             :                 &(poJob->eDstBufferDataType), poJob->pszResampling);
    4509             :         }
    4510             :         // Wrap the destination buffer set by the resampler in a PointerHolder stored on the job. NOTE(review): presumably so the buffer is released together with the job - confirm against PointerHolder.
    4511             :         poJob->oDstBufferHolder =
    4512         702 :             std::make_unique<PointerHolder>(poJob->pDstBuffer);
    4513             :         // Publish completion while holding the job mutex, then wake a thread waiting on the job's condition variable.
    4514             :         {
    4515        1404 :             std::lock_guard<std::mutex> guard(poJob->mutex);
    4516         702 :             poJob->bFinished = true;
    4517         702 :             poJob->cv.notify_one();
    4518             :         }
    4519         702 :     };
    4520             : 
    4521             :     // Function to write resample data to target band: writes poJob->pDstBuffer to poJob->poDstBand with RasterIO(GF_Write), covering nDstWidth columns from x=0 and rows [nDstYOff, nDstYOff2). Returns the RasterIO() result.
    4522         702 :     const auto WriteJobData = [](const OvrJob *poJob)
    4523             :     {
    4524        1404 :         return poJob->poDstBand->RasterIO(
    4525         702 :             GF_Write, 0, poJob->nDstYOff, poJob->nDstWidth,
    4526         702 :             poJob->nDstYOff2 - poJob->nDstYOff, poJob->pDstBuffer,
    4527         702 :             poJob->nDstWidth, poJob->nDstYOff2 - poJob->nDstYOff,
    4528         702 :             poJob->eDstBufferDataType, 0, 0, nullptr);
    4529             :     };
    4530             : 
    4531             :     // Wait for completion of oldest job and serialize it: blocks until the job at the front of jobList is flagged finished, writes its data if it succeeded, then removes it from the list. Returns the job's error, or the write result when the job succeeded. jobList must not be empty (front() is called unconditionally).
    4532             :     const auto WaitAndFinalizeOldestJob =
    4533           0 :         [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    4534             :     {
    4535           0 :         auto poOldestJob = jobList.front().get();
    4536             :         {
    4537           0 :             std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);
    4538             :             // coverity[missing_lock:FALSE]
    4539           0 :             while (!poOldestJob->bFinished)  // loop re-checks the flag after every wake-up of cv.wait()
    4540             :             {
    4541           0 :                 poOldestJob->cv.wait(oGuard);
    4542             :             }
    4543             :         }
    4544           0 :         CPLErr l_eErr = poOldestJob->eErr;
    4545           0 :         if (l_eErr == CE_None)
    4546             :         {
    4547           0 :             l_eErr = WriteJobData(poOldestJob);
    4548             :         }
    4549             :         // Removing the front entry destroys the OvrJob, since the list owns it through unique_ptr; poOldestJob dangles afterwards.
    4550           0 :         jobList.pop_front();
    4551           0 :         return l_eErr;
    4552             :     };
    4553             : 
    4554             :     // Queue of jobs
    4555        1246 :     std::list<std::unique_ptr<OvrJob>> jobList;
    4556             : 
    4557         623 :     GByte *pabyChunkNodataMask = nullptr;
    4558         623 :     void *pChunk = nullptr;
    4559             : 
    4560         623 :     const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
    4561        2492 :     const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
    4562         623 :                                                        ? CPLGetNumCPUs()
    4563         623 :                                                        : atoi(pszThreads)));
    4564             :     auto poThreadPool =
    4565         623 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    4566             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    4567        1246 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    4568             : 
    4569             :     /* -------------------------------------------------------------------- */
    4570             :     /*      Loop over image operating on chunks.                            */
    4571             :     /* -------------------------------------------------------------------- */
    4572         623 :     int nChunkYOff = 0;
    4573         623 :     CPLErr eErr = CE_None;
    4574             : 
    4575        1251 :     for (nChunkYOff = 0; nChunkYOff < nHeight && eErr == CE_None;
    4576         628 :          nChunkYOff += nFullResYChunk)
    4577             :     {
    4578         628 :         if (!pfnProgress(nChunkYOff / static_cast<double>(nHeight), nullptr,
    4579             :                          pProgressData))
    4580             :         {
    4581           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    4582           0 :             eErr = CE_Failure;
    4583             :         }
    4584             : 
    4585         628 :         if (nFullResYChunk + nChunkYOff > nHeight)
    4586         621 :             nFullResYChunk = nHeight - nChunkYOff;
    4587             : 
    4588         628 :         int nChunkYOffQueried = nChunkYOff - nKernelRadius * nMaxOvrFactor;
    4589         628 :         int nChunkYSizeQueried =
    4590         628 :             nFullResYChunk + 2 * nKernelRadius * nMaxOvrFactor;
    4591         628 :         if (nChunkYOffQueried < 0)
    4592             :         {
    4593          62 :             nChunkYSizeQueried += nChunkYOffQueried;
    4594          62 :             nChunkYOffQueried = 0;
    4595             :         }
    4596         628 :         if (nChunkYOffQueried + nChunkYSizeQueried > nHeight)
    4597          62 :             nChunkYSizeQueried = nHeight - nChunkYOffQueried;
    4598             : 
    4599             :         // Avoid accumulating too many tasks and exhaust RAM
    4600             :         // Try to complete already finished jobs
    4601         628 :         while (eErr == CE_None && !jobList.empty())
    4602             :         {
    4603           0 :             auto poOldestJob = jobList.front().get();
    4604             :             {
    4605           0 :                 std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
    4606           0 :                 if (!poOldestJob->bFinished)
    4607             :                 {
    4608           0 :                     break;
    4609             :                 }
    4610             :             }
    4611           0 :             eErr = poOldestJob->eErr;
    4612           0 :             if (eErr == CE_None)
    4613             :             {
    4614           0 :                 eErr = WriteJobData(poOldestJob);
    4615             :             }
    4616             : 
    4617           0 :             jobList.pop_front();
    4618             :         }
    4619             : 
    4620             :         // And in case we have saturated the number of threads,
    4621             :         // wait for completion of tasks to go below the threshold.
    4622        1256 :         while (eErr == CE_None &&
    4623         628 :                jobList.size() >= static_cast<size_t>(nThreads))
    4624             :         {
    4625           0 :             eErr = WaitAndFinalizeOldestJob(jobList);
    4626             :         }
    4627             : 
    4628             :         // (Re)allocate buffers if needed
    4629         628 :         if (pChunk == nullptr)
    4630             :         {
    4631         623 :             pChunk = VSI_MALLOC3_VERBOSE(GDALGetDataTypeSizeBytes(eWrkDataType),
    4632             :                                          nMaxChunkYSizeQueried, nWidth);
    4633             :         }
    4634         628 :         if (bUseNoDataMask && pabyChunkNodataMask == nullptr)
    4635             :         {
    4636             :             pabyChunkNodataMask = static_cast<GByte *>(
    4637         265 :                 VSI_MALLOC2_VERBOSE(nMaxChunkYSizeQueried, nWidth));
    4638             :         }
    4639             : 
    4640         628 :         if (pChunk == nullptr ||
    4641         265 :             (bUseNoDataMask && pabyChunkNodataMask == nullptr))
    4642             :         {
    4643           0 :             CPLFree(pChunk);
    4644           0 :             CPLFree(pabyChunkNodataMask);
    4645           0 :             return CE_Failure;
    4646             :         }
    4647             : 
    4648             :         // Read chunk.
    4649         628 :         if (eErr == CE_None)
    4650         628 :             eErr = poSrcBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    4651             :                                        nChunkYSizeQueried, pChunk, nWidth,
    4652             :                                        nChunkYSizeQueried, eWrkDataType, 0, 0,
    4653             :                                        nullptr);
    4654         628 :         if (eErr == CE_None && bUseNoDataMask)
    4655         265 :             eErr = poMaskBand->RasterIO(GF_Read, 0, nChunkYOffQueried, nWidth,
    4656             :                                         nChunkYSizeQueried, pabyChunkNodataMask,
    4657             :                                         nWidth, nChunkYSizeQueried, GDT_Byte, 0,
    4658             :                                         0, nullptr);
    4659             : 
    4660             :         // Special case to promote 1bit data to 8bit 0/255 values.
    4661         628 :         if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE"))
    4662             :         {
    4663           9 :             if (eWrkDataType == GDT_Float32)
    4664             :             {
    4665           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    4666           0 :                 for (GPtrDiff_t i = 0;
    4667           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4668             :                      i++)
    4669             :                 {
    4670           0 :                     if (pafChunk[i] == 1.0)
    4671           0 :                         pafChunk[i] = 255.0;
    4672             :                 }
    4673             :             }
    4674           9 :             else if (eWrkDataType == GDT_Byte)
    4675             :             {
    4676           9 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    4677      168417 :                 for (GPtrDiff_t i = 0;
    4678      168417 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4679             :                      i++)
    4680             :                 {
    4681      168408 :                     if (pabyChunk[i] == 1)
    4682      127437 :                         pabyChunk[i] = 255;
    4683             :                 }
    4684             :             }
    4685           0 :             else if (eWrkDataType == GDT_UInt16)
    4686             :             {
    4687           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    4688           0 :                 for (GPtrDiff_t i = 0;
    4689           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4690             :                      i++)
    4691             :                 {
    4692           0 :                     if (pasChunk[i] == 1)
    4693           0 :                         pasChunk[i] = 255;
    4694             :                 }
    4695             :             }
    4696           0 :             else if (eWrkDataType == GDT_Float64)
    4697             :             {
    4698           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    4699           0 :                 for (GPtrDiff_t i = 0;
    4700           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4701             :                      i++)
    4702             :                 {
    4703           0 :                     if (padfChunk[i] == 1.0)
    4704           0 :                         padfChunk[i] = 255.0;
    4705             :                 }
    4706             :             }
    4707             :             else
    4708             :             {
    4709           0 :                 CPLAssert(false);
    4710             :             }
    4711             :         }
    4712         619 :         else if (EQUAL(pszResampling, "AVERAGE_BIT2GRAYSCALE_MINISWHITE"))
    4713             :         {
    4714           0 :             if (eWrkDataType == GDT_Float32)
    4715             :             {
    4716           0 :                 float *pafChunk = static_cast<float *>(pChunk);
    4717           0 :                 for (GPtrDiff_t i = 0;
    4718           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4719             :                      i++)
    4720             :                 {
    4721           0 :                     if (pafChunk[i] == 1.0)
    4722           0 :                         pafChunk[i] = 0.0;
    4723           0 :                     else if (pafChunk[i] == 0.0)
    4724           0 :                         pafChunk[i] = 255.0;
    4725             :                 }
    4726             :             }
    4727           0 :             else if (eWrkDataType == GDT_Byte)
    4728             :             {
    4729           0 :                 GByte *pabyChunk = static_cast<GByte *>(pChunk);
    4730           0 :                 for (GPtrDiff_t i = 0;
    4731           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4732             :                      i++)
    4733             :                 {
    4734           0 :                     if (pabyChunk[i] == 1)
    4735           0 :                         pabyChunk[i] = 0;
    4736           0 :                     else if (pabyChunk[i] == 0)
    4737           0 :                         pabyChunk[i] = 255;
    4738             :                 }
    4739             :             }
    4740           0 :             else if (eWrkDataType == GDT_UInt16)
    4741             :             {
    4742           0 :                 GUInt16 *pasChunk = static_cast<GUInt16 *>(pChunk);
    4743           0 :                 for (GPtrDiff_t i = 0;
    4744           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4745             :                      i++)
    4746             :                 {
    4747           0 :                     if (pasChunk[i] == 1)
    4748           0 :                         pasChunk[i] = 0;
    4749           0 :                     else if (pasChunk[i] == 0)
    4750           0 :                         pasChunk[i] = 255;
    4751             :                 }
    4752             :             }
    4753           0 :             else if (eWrkDataType == GDT_Float64)
    4754             :             {
    4755           0 :                 double *padfChunk = static_cast<double *>(pChunk);
    4756           0 :                 for (GPtrDiff_t i = 0;
    4757           0 :                      i < static_cast<GPtrDiff_t>(nChunkYSizeQueried) * nWidth;
    4758             :                      i++)
    4759             :                 {
    4760           0 :                     if (padfChunk[i] == 1.0)
    4761           0 :                         padfChunk[i] = 0.0;
    4762           0 :                     else if (padfChunk[i] == 0.0)
    4763           0 :                         padfChunk[i] = 255.0;
    4764             :                 }
    4765             :             }
    4766             :             else
    4767             :             {
    4768           0 :                 CPLAssert(false);
    4769             :             }
    4770             :         }
    4771             : 
    4772             :         auto oSrcBufferHolder =
    4773        1256 :             std::make_shared<PointerHolder>(poJobQueue ? pChunk : nullptr);
    4774             :         auto oSrcMaskBufferHolder = std::make_shared<PointerHolder>(
    4775        1256 :             poJobQueue ? pabyChunkNodataMask : nullptr);
    4776             : 
    4777        1330 :         for (int iOverview = 0; iOverview < nOverviewCount && eErr == CE_None;
    4778             :              ++iOverview)
    4779             :         {
    4780         702 :             GDALRasterBand *poDstBand = papoOvrBands[iOverview];
    4781         702 :             const int nDstWidth = poDstBand->GetXSize();
    4782         702 :             const int nDstHeight = poDstBand->GetYSize();
    4783             : 
    4784         702 :             const double dfXRatioDstToSrc =
    4785         702 :                 static_cast<double>(nWidth) / nDstWidth;
    4786         702 :             const double dfYRatioDstToSrc =
    4787         702 :                 static_cast<double>(nHeight) / nDstHeight;
    4788             : 
    4789             :             /* --------------------------------------------------------------------
    4790             :              */
    4791             :             /*      Figure out the line to start writing to, and the first line
    4792             :              */
    4793             :             /*      to not write to.  In theory this approach should ensure that
    4794             :              */
    4795             :             /*      every output line will be written if all input chunks are */
    4796             :             /*      processed. */
    4797             :             /* --------------------------------------------------------------------
    4798             :              */
    4799         702 :             int nDstYOff =
    4800         702 :                 static_cast<int>(0.5 + nChunkYOff / dfYRatioDstToSrc);
    4801         702 :             if (nDstYOff == nDstHeight)
    4802           0 :                 continue;
    4803         702 :             int nDstYOff2 = static_cast<int>(
    4804         702 :                 0.5 + (nChunkYOff + nFullResYChunk) / dfYRatioDstToSrc);
    4805             : 
    4806         702 :             if (nChunkYOff + nFullResYChunk == nHeight)
    4807         695 :                 nDstYOff2 = nDstHeight;
    4808             : #if DEBUG_VERBOSE
    4809             :             CPLDebug("GDAL",
    4810             :                      "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)", 0,
    4811             :                      nChunkYOffQueried, nWidth, nChunkYSizeQueried, 0, nDstYOff,
    4812             :                      nDstWidth, nDstYOff2 - nDstYOff);
    4813             : #endif
    4814             : 
    4815        1404 :             auto poJob = std::unique_ptr<OvrJob>(new OvrJob());
    4816         702 :             poJob->pfnResampleFn = pfnResampleFn;
    4817         702 :             poJob->dfXRatioDstToSrc = dfXRatioDstToSrc;
    4818         702 :             poJob->dfYRatioDstToSrc = dfYRatioDstToSrc;
    4819         702 :             poJob->eWrkDataType = eWrkDataType;
    4820         702 :             poJob->pChunk = pChunk;
    4821         702 :             poJob->pabyChunkNodataMask = pabyChunkNodataMask;
    4822         702 :             poJob->nWidth = nWidth;
    4823         702 :             poJob->nHeight = nHeight;
    4824         702 :             poJob->nChunkYOff = nChunkYOffQueried;
    4825         702 :             poJob->nChunkYSize = nChunkYSizeQueried;
    4826         702 :             poJob->nDstWidth = nDstWidth;
    4827         702 :             poJob->nDstYOff = nDstYOff;
    4828         702 :             poJob->nDstYOff2 = nDstYOff2;
    4829         702 :             poJob->poDstBand = poDstBand;
    4830         702 :             poJob->pszResampling = pszResampling;
    4831         702 :             poJob->bHasNoData = bHasNoData;
    4832         702 :             poJob->dfNoDataValue = dfNoDataValue;
    4833         702 :             poJob->poColorTable = poColorTable;
    4834         702 :             poJob->eSrcDataType = eSrcDataType;
    4835         702 :             poJob->bPropagateNoData = bPropagateNoData;
    4836             : 
    4837         702 :             if (poJobQueue)
    4838             :             {
    4839           0 :                 poJob->SetSrcMaskBufferHolder(oSrcMaskBufferHolder);
    4840           0 :                 poJob->SetSrcBufferHolder(oSrcBufferHolder);
    4841           0 :                 poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    4842           0 :                 jobList.emplace_back(std::move(poJob));
    4843             :             }
    4844             :             else
    4845             :             {
    4846         702 :                 JobResampleFunc(poJob.get());
    4847         702 :                 eErr = poJob->eErr;
    4848         702 :                 if (eErr == CE_None)
    4849             :                 {
    4850         702 :                     eErr = WriteJobData(poJob.get());
    4851             :                 }
    4852             :             }
    4853             :         }
    4854             : 
    4855         628 :         if (poJobQueue)
    4856             :         {
    4857           0 :             pChunk = nullptr;
    4858           0 :             pabyChunkNodataMask = nullptr;
    4859             :         }
    4860             :     }
    4861             : 
    4862         623 :     VSIFree(pChunk);
    4863         623 :     VSIFree(pabyChunkNodataMask);
    4864             : 
    4865             :     // Wait for all pending jobs to complete
    4866         623 :     while (!jobList.empty())
    4867             :     {
    4868           0 :         const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    4869           0 :         if (l_eErr != CE_None && eErr == CE_None)
    4870           0 :             eErr = l_eErr;
    4871             :     }
    4872             : 
    4873             :     /* -------------------------------------------------------------------- */
    4874             :     /*      Renormalized overview mean / stddev if needed.                  */
    4875             :     /* -------------------------------------------------------------------- */
    4876         623 :     if (eErr == CE_None && EQUAL(pszResampling, "AVERAGE_MP"))
    4877             :     {
    4878           0 :         GDALOverviewMagnitudeCorrection(
    4879             :             poSrcBand, nOverviewCount,
    4880             :             reinterpret_cast<GDALRasterBandH *>(papoOvrBands),
    4881             :             GDALDummyProgress, nullptr);
    4882             :     }
    4883             : 
    4884             :     /* -------------------------------------------------------------------- */
    4885             :     /*      It can be important to flush out data to overviews.             */
    4886             :     /* -------------------------------------------------------------------- */
    4887        1318 :     for (int iOverview = 0; eErr == CE_None && iOverview < nOverviewCount;
    4888             :          ++iOverview)
    4889             :     {
    4890         695 :         eErr = papoOvrBands[iOverview]->FlushCache(false);
    4891             :     }
    4892             : 
    4893         623 :     if (eErr == CE_None)
    4894         623 :         pfnProgress(1.0, nullptr, pProgressData);
    4895             : 
    4896         623 :     return eErr;
    4897             : }
    4898             : 
    4899             : /************************************************************************/
    4900             : /*            GDALRegenerateOverviewsMultiBand()                        */
    4901             : /************************************************************************/
    4902             : 
    4903             : /**
    4904             :  * \brief Variant of GDALRegenerateOverviews, specially dedicated for generating
    4905             :  * compressed pixel-interleaved overviews (JPEG-IN-TIFF for example)
    4906             :  *
    4907             :  * This function will generate one or more overview images from a base
    4908             :  * image using the requested downsampling algorithm.  Its primary use
    4909             :  * is for generating overviews via GDALDataset::BuildOverviews(), but it
    4910             :  * can also be used to generate downsampled images in one file from another
    4911             :  * outside the overview architecture.
    4912             :  *
    4913             :  * The output bands need to exist in advance and share the same characteristics
    4914             :  * (type, dimensions)
    4915             :  *
    4916             :  * The resampling algorithms supported for the moment are "NEAREST", "AVERAGE",
    4917             :  * "RMS", "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" and "MODE"
    4918             :  *
    4919             :  * It does not support color tables or complex data types.
    4920             :  *
    4921             :  * The pseudo-algorithm used by the function is :
    4922             :  *    for each overview
    4923             :  *       iterate on lines of the source by a step of deltay
    4924             :  *           iterate on columns of the source  by a step of deltax
    4925             :  *               read the source data of size deltax * deltay for all the bands
    4926             :  *               generate the corresponding overview block for all the bands
    4927             :  *
    4928             :  * This function will properly honour NODATA_VALUES tuples (special dataset
    4929             :  * metadata) so that only a given RGB triplet (in case of a RGB image) will be
    4930             :  * considered as the nodata value and not each value of the triplet
    4931             :  * independently per band.
    4932             :  *
    4933             :  * Starting with GDAL 3.2, the GDAL_NUM_THREADS configuration option can be set
    4934             :  * to "ALL_CPUS" or an integer value to specify the number of threads to use for
    4935             :  * overview computation.
    4936             :  *
    4937             :  * @param nBands the number of bands, size of papoSrcBands and size of
    4938             :  *               first dimension of papapoOverviewBands
    4939             :  * @param papoSrcBands the list of source bands to downsample
    4940             :  * @param nOverviews the number of downsampled overview levels being generated.
    4941             :  * @param papapoOverviewBands bidimensional array of bands. First dimension is
    4942             :  *                            indexed by nBands. Second dimension is indexed by
    4943             :  *                            nOverviews.
    4944             :  * @param pszResampling Resampling algorithm ("NEAREST", "AVERAGE", "RMS",
    4945             :  * "GAUSS", "CUBIC", "CUBICSPLINE", "LANCZOS", "BILINEAR" or "MODE").
    4946             :  * @param pfnProgress progress report function.
    4947             :  * @param pProgressData progress function callback data.
    4948             :  * @param papszOptions (GDAL >= 3.6) NULL terminated list of options as
    4949             :  *                     key=value pairs, or NULL
    4950             :  *                     Starting with GDAL 3.8, the XOFF, YOFF, XSIZE and YSIZE
    4951             :  *                     options can be specified to express that overviews should
    4952             :  *                     be regenerated only in the specified subset of the source
    4953             :  *                     dataset.
    4954             :  * @return CE_None on success or CE_Failure on failure.
    4955             :  */
    4956             : 
    4957         328 : CPLErr GDALRegenerateOverviewsMultiBand(
    4958             :     int nBands, GDALRasterBand *const *papoSrcBands, int nOverviews,
    4959             :     GDALRasterBand *const *const *papapoOverviewBands,
    4960             :     const char *pszResampling, GDALProgressFunc pfnProgress,
    4961             :     void *pProgressData, CSLConstList papszOptions)
    4962             : {
    4963         328 :     CPL_IGNORE_RET_VAL(papszOptions);
    4964             : 
    4965         328 :     if (pfnProgress == nullptr)
    4966           6 :         pfnProgress = GDALDummyProgress;
    4967             : 
    4968         328 :     if (EQUAL(pszResampling, "NONE"))
    4969           2 :         return CE_None;
    4970             : 
    4971             :     // Sanity checks.
    4972         326 :     if (!STARTS_WITH_CI(pszResampling, "NEAR") &&
    4973         165 :         !EQUAL(pszResampling, "RMS") && !EQUAL(pszResampling, "AVERAGE") &&
    4974          68 :         !EQUAL(pszResampling, "GAUSS") && !EQUAL(pszResampling, "CUBIC") &&
    4975          16 :         !EQUAL(pszResampling, "CUBICSPLINE") &&
    4976          15 :         !EQUAL(pszResampling, "LANCZOS") && !EQUAL(pszResampling, "BILINEAR") &&
    4977           5 :         !EQUAL(pszResampling, "MODE"))
    4978             :     {
    4979           0 :         CPLError(CE_Failure, CPLE_NotSupported,
    4980             :                  "GDALRegenerateOverviewsMultiBand: pszResampling='%s' "
    4981             :                  "not supported",
    4982             :                  pszResampling);
    4983           0 :         return CE_Failure;
    4984             :     }
    4985             : 
    4986         326 :     int nKernelRadius = 0;
    4987             :     GDALResampleFunction pfnResampleFn =
    4988         326 :         GDALGetResampleFunction(pszResampling, &nKernelRadius);
    4989         326 :     if (pfnResampleFn == nullptr)
    4990           0 :         return CE_Failure;
    4991             : 
    4992         326 :     const int nToplevelSrcWidth = papoSrcBands[0]->GetXSize();
    4993         326 :     const int nToplevelSrcHeight = papoSrcBands[0]->GetYSize();
    4994         326 :     if (nToplevelSrcWidth <= 0 || nToplevelSrcHeight <= 0)
    4995           0 :         return CE_None;
    4996         326 :     GDALDataType eDataType = papoSrcBands[0]->GetRasterDataType();
    4997         603 :     for (int iBand = 1; iBand < nBands; ++iBand)
    4998             :     {
    4999         554 :         if (papoSrcBands[iBand]->GetXSize() != nToplevelSrcWidth ||
    5000         277 :             papoSrcBands[iBand]->GetYSize() != nToplevelSrcHeight)
    5001             :         {
    5002           0 :             CPLError(
    5003             :                 CE_Failure, CPLE_NotSupported,
    5004             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5005             :                 "have the same dimensions");
    5006           0 :             return CE_Failure;
    5007             :         }
    5008         277 :         if (papoSrcBands[iBand]->GetRasterDataType() != eDataType)
    5009             :         {
    5010           0 :             CPLError(
    5011             :                 CE_Failure, CPLE_NotSupported,
    5012             :                 "GDALRegenerateOverviewsMultiBand: all the source bands must "
    5013             :                 "have the same data type");
    5014           0 :             return CE_Failure;
    5015             :         }
    5016             :     }
    5017             : 
    5018         884 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5019             :     {
    5020         558 :         const int nDstWidth = papapoOverviewBands[0][iOverview]->GetXSize();
    5021         558 :         const int nDstHeight = papapoOverviewBands[0][iOverview]->GetYSize();
    5022        1101 :         for (int iBand = 1; iBand < nBands; ++iBand)
    5023             :         {
    5024         543 :             if (papapoOverviewBands[iBand][iOverview]->GetXSize() !=
    5025        1086 :                     nDstWidth ||
    5026         543 :                 papapoOverviewBands[iBand][iOverview]->GetYSize() != nDstHeight)
    5027             :             {
    5028           0 :                 CPLError(
    5029             :                     CE_Failure, CPLE_NotSupported,
    5030             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5031             :                     "of the same level must have the same dimensions");
    5032           0 :                 return CE_Failure;
    5033             :             }
    5034         543 :             if (papapoOverviewBands[iBand][iOverview]->GetRasterDataType() !=
    5035             :                 eDataType)
    5036             :             {
    5037           0 :                 CPLError(
    5038             :                     CE_Failure, CPLE_NotSupported,
    5039             :                     "GDALRegenerateOverviewsMultiBand: all the overviews bands "
    5040             :                     "must have the same data type as the source bands");
    5041           0 :                 return CE_Failure;
    5042             :             }
    5043             :         }
    5044             :     }
    5045             : 
    5046             :     // First pass to compute the total number of pixels to write.
    5047         326 :     double dfTotalPixelCount = 0;
    5048         326 :     const int nSrcXOff = atoi(CSLFetchNameValueDef(papszOptions, "XOFF", "0"));
    5049         326 :     const int nSrcYOff = atoi(CSLFetchNameValueDef(papszOptions, "YOFF", "0"));
    5050         326 :     const int nSrcXSize = atoi(CSLFetchNameValueDef(
    5051             :         papszOptions, "XSIZE", CPLSPrintf("%d", nToplevelSrcWidth)));
    5052         326 :     const int nSrcYSize = atoi(CSLFetchNameValueDef(
    5053             :         papszOptions, "YSIZE", CPLSPrintf("%d", nToplevelSrcHeight)));
    5054         884 :     for (int iOverview = 0; iOverview < nOverviews; ++iOverview)
    5055             :     {
    5056         558 :         dfTotalPixelCount +=
    5057        1116 :             static_cast<double>(nSrcXSize) / nToplevelSrcWidth *
    5058         558 :             papapoOverviewBands[0][iOverview]->GetXSize() *
    5059        1116 :             static_cast<double>(nSrcYSize) / nToplevelSrcHeight *
    5060         558 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5061             :     }
    5062             : 
    5063             :     const GDALDataType eWrkDataType =
    5064         326 :         GDALGetOvrWorkDataType(pszResampling, eDataType);
    5065             : 
    5066         326 :     const bool bIsMask = papoSrcBands[0]->IsMaskBand();
    5067             : 
    5068             :     // If we have a nodata mask and we are doing something more complicated
    5069             :     // than nearest neighbouring, we have to fetch the nodata mask.
    5070             :     const bool bUseNoDataMask =
    5071         486 :         !STARTS_WITH_CI(pszResampling, "NEAR") &&
    5072         160 :         (bIsMask || (papoSrcBands[0]->GetMaskFlags() & GMF_ALL_VALID) == 0);
    5073             : 
    5074             :     bool *const pabHasNoData =
    5075         326 :         static_cast<bool *>(VSI_MALLOC_VERBOSE(nBands * sizeof(bool)));
    5076             :     double *const padfNoDataValue =
    5077         326 :         static_cast<double *>(VSI_MALLOC_VERBOSE(nBands * sizeof(double)));
    5078         326 :     if (pabHasNoData == nullptr || padfNoDataValue == nullptr)
    5079             :     {
    5080           0 :         CPLFree(pabHasNoData);
    5081           0 :         CPLFree(padfNoDataValue);
    5082           0 :         return CE_Failure;
    5083             :     }
    5084             : 
    5085         929 :     for (int iBand = 0; iBand < nBands; ++iBand)
    5086             :     {
    5087         603 :         int nHasNoData = 0;
    5088        1206 :         padfNoDataValue[iBand] =
    5089         603 :             papoSrcBands[iBand]->GetNoDataValue(&nHasNoData);
    5090         603 :         pabHasNoData[iBand] = CPL_TO_BOOL(nHasNoData);
    5091             :     }
    5092             :     const bool bPropagateNoData =
    5093         326 :         CPLTestBool(CPLGetConfigOption("GDAL_OVR_PROPAGATE_NODATA", "NO"));
    5094             : 
    5095         326 :     const char *pszThreads = CPLGetConfigOption("GDAL_NUM_THREADS", "1");
    5096        1304 :     const int nThreads = std::max(1, std::min(128, EQUAL(pszThreads, "ALL_CPUS")
    5097         326 :                                                        ? CPLGetNumCPUs()
    5098         326 :                                                        : atoi(pszThreads)));
    5099             :     auto poThreadPool =
    5100         326 :         nThreads > 1 ? GDALGetGlobalThreadPool(nThreads) : nullptr;
    5101             :     auto poJobQueue = poThreadPool ? poThreadPool->CreateJobQueue()
    5102         326 :                                    : std::unique_ptr<CPLJobQueue>(nullptr);
    5103             : 
    5104             :     // Only configurable for debug / testing
    5105             :     const int nChunkMaxSize =
    5106         326 :         atoi(CPLGetConfigOption("GDAL_OVR_CHUNK_MAX_SIZE", "10485760"));
    5107             : 
    5108             :     // Second pass to do the real job.
    5109         326 :     double dfCurPixelCount = 0;
    5110         326 :     CPLErr eErr = CE_None;
    5111         883 :     for (int iOverview = 0; iOverview < nOverviews && eErr == CE_None;
    5112             :          ++iOverview)
    5113             :     {
    5114         557 :         int iSrcOverview = -1;  // -1 means the source bands.
    5115             : 
    5116         557 :         int nDstChunkXSize = 0;
    5117         557 :         int nDstChunkYSize = 0;
    5118         557 :         papapoOverviewBands[0][iOverview]->GetBlockSize(&nDstChunkXSize,
    5119             :                                                         &nDstChunkYSize);
    5120             : 
    5121             :         const int nDstTotalWidth =
    5122         557 :             papapoOverviewBands[0][iOverview]->GetXSize();
    5123             :         const int nDstTotalHeight =
    5124         557 :             papapoOverviewBands[0][iOverview]->GetYSize();
    5125             : 
    5126             :         // Compute the coordinates of the target region to refresh
    5127         557 :         constexpr double EPS = 1e-8;
    5128         557 :         const int nDstXOffStart = static_cast<int>(
    5129         557 :             static_cast<double>(nSrcXOff) / nToplevelSrcWidth * nDstTotalWidth +
    5130             :             EPS);
    5131             :         const int nDstXOffEnd =
    5132        1114 :             std::min(static_cast<int>(
    5133         557 :                          std::ceil(static_cast<double>(nSrcXOff + nSrcXSize) /
    5134         557 :                                        nToplevelSrcWidth * nDstTotalWidth -
    5135             :                                    EPS)),
    5136         557 :                      nDstTotalWidth);
    5137         557 :         const int nDstWidth = nDstXOffEnd - nDstXOffStart;
    5138         557 :         const int nDstYOffStart =
    5139         557 :             static_cast<int>(static_cast<double>(nSrcYOff) /
    5140         557 :                                  nToplevelSrcHeight * nDstTotalHeight +
    5141             :                              EPS);
    5142             :         const int nDstYOffEnd =
    5143        1114 :             std::min(static_cast<int>(
    5144         557 :                          std::ceil(static_cast<double>(nSrcYOff + nSrcYSize) /
    5145         557 :                                        nToplevelSrcHeight * nDstTotalHeight -
    5146             :                                    EPS)),
    5147         557 :                      nDstTotalHeight);
    5148             : 
    5149             :         // Try to use previous level of overview as the source to compute
    5150             :         // the next level.
    5151         557 :         int nSrcWidth = nToplevelSrcWidth;
    5152         557 :         int nSrcHeight = nToplevelSrcHeight;
    5153         788 :         if (iOverview > 0 &&
    5154         231 :             papapoOverviewBands[0][iOverview - 1]->GetXSize() > nDstTotalWidth)
    5155             :         {
    5156         223 :             nSrcWidth = papapoOverviewBands[0][iOverview - 1]->GetXSize();
    5157         223 :             nSrcHeight = papapoOverviewBands[0][iOverview - 1]->GetYSize();
    5158         223 :             iSrcOverview = iOverview - 1;
    5159             :         }
    5160             : 
    5161         557 :         const double dfXRatioDstToSrc =
    5162         557 :             static_cast<double>(nSrcWidth) / nDstTotalWidth;
    5163         557 :         const double dfYRatioDstToSrc =
    5164         557 :             static_cast<double>(nSrcHeight) / nDstTotalHeight;
    5165             : 
    5166        1114 :         int nOvrFactor = std::max(static_cast<int>(0.5 + dfXRatioDstToSrc),
    5167         557 :                                   static_cast<int>(0.5 + dfYRatioDstToSrc));
    5168         557 :         if (nOvrFactor == 0)
    5169           0 :             nOvrFactor = 1;
    5170             : 
    5171             :         // Try to extend the chunk size so that the memory needed to acquire
    5172             :         // source pixels goes up to 10 MB.
    5173             :         // This can help for drivers that support multi-threaded reading
    5174         557 :         const int nFullResYChunk =
    5175         557 :             2 + static_cast<int>(nDstChunkYSize * dfYRatioDstToSrc);
    5176         557 :         const int nFullResYChunkQueried =
    5177         557 :             nFullResYChunk + 2 * nKernelRadius * nOvrFactor;
    5178         782 :         while (nDstChunkXSize < nDstWidth)
    5179             :         {
    5180         232 :             const int nFullResXChunk =
    5181         232 :                 2 + static_cast<int>(2 * nDstChunkXSize * dfXRatioDstToSrc);
    5182             : 
    5183         232 :             const int nFullResXChunkQueried =
    5184         232 :                 nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    5185             : 
    5186         464 :             if (static_cast<GIntBig>(nFullResXChunkQueried) *
    5187         464 :                     nFullResYChunkQueried * nBands *
    5188         232 :                     GDALGetDataTypeSizeBytes(eWrkDataType) >
    5189         232 :                 nChunkMaxSize)
    5190             :             {
    5191           7 :                 break;
    5192             :             }
    5193             : 
    5194         225 :             nDstChunkXSize *= 2;
    5195             :         }
    5196         557 :         nDstChunkXSize = std::min(nDstChunkXSize, nDstWidth);
    5197             : 
    5198         557 :         const int nFullResXChunk =
    5199         557 :             2 + static_cast<int>(nDstChunkXSize * dfXRatioDstToSrc);
    5200         557 :         const int nFullResXChunkQueried =
    5201         557 :             nFullResXChunk + 2 * nKernelRadius * nOvrFactor;
    5202             : 
    5203             :         // Structure describing one resampling job: one band of one destination chunk, run either inline or through poJobQueue.
    5204             :         struct OvrJob
    5205             :         {
    5206             :             // Buffers to free when job is finished. NOTE(review): PointerHolder is defined outside this chunk; presumably it releases the wrapped pointer on destruction - confirm.
    5207             :             std::unique_ptr<PointerHolder> oSrcMaskBufferHolder{};  // wraps the nodata mask buffer; only set when the job is submitted to the job queue
    5208             :             std::unique_ptr<PointerHolder> oSrcBufferHolder{};  // wraps the source pixel buffer; only set when the job is submitted to the job queue
    5209             :             std::unique_ptr<PointerHolder> oDstBufferHolder{};  // wraps pDstBuffer; set by JobResampleFunc once pfnResampleFn has returned
    5210             : 
    5211             :             // Input parameters of pfnResampleFn (filled in by the code that creates the job, before it is run)
    5212             :             GDALResampleFunction pfnResampleFn = nullptr;  // resampling routine invoked by JobResampleFunc
    5213             :             double dfXRatioDstToSrc{};  // source width divided by destination overview width
    5214             :             double dfYRatioDstToSrc{};  // source height divided by destination overview height
    5215             :             GDALDataType eWrkDataType = GDT_Unknown;  // data type of the pixels stored in pChunk
    5216             :             const void *pChunk = nullptr;  // source pixels read for this band
    5217             :             const GByte *pabyChunkNodataMask = nullptr;  // source mask values (read as GDT_Byte); stays null when no nodata mask is used
    5218             :             int nChunkXOff = 0;  // X offset, in the source band, of the window held in pChunk (queried window, i.e. including the kernel margin)
    5219             :             int nChunkXSize = 0;  // width of the window held in pChunk
    5220             :             int nChunkYOff = 0;  // Y offset, in the source band, of the window held in pChunk
    5221             :             int nChunkYSize = 0;  // height of the window held in pChunk
    5222             :             int nDstXOff = 0;  // first destination column to compute
    5223             :             int nDstXOff2 = 0;  // end destination column, exclusive (nDstXOff + column count)
    5224             :             int nDstYOff = 0;  // first destination row to compute
    5225             :             int nDstYOff2 = 0;  // end destination row, exclusive (nDstYOff + row count)
    5226             :             GDALRasterBand *poOverview = nullptr;  // overview band receiving the result (written by WriteJobData)
    5227             :             const char *pszResampling = nullptr;  // resampling method name, not owned by the job
    5228             :             bool bHasNoData = false;  // whether the source band reported a nodata value
    5229             :             double dfNoDataValue = 0.0;  // nodata value reported by the source band
    5230             :             GDALDataType eSrcDataType = GDT_Unknown;  // data type of the source bands
    5231             :             bool bPropagateNoData = false;  // value of the GDAL_OVR_PROPAGATE_NODATA configuration option
    5232             : 
    5233             :             // Output values of resampling function
    5234             :             CPLErr eErr = CE_Failure;  // stays CE_Failure until overwritten with the return value of pfnResampleFn
    5235             :             void *pDstBuffer = nullptr;  // result buffer returned by pfnResampleFn through an out-parameter
    5236             :             GDALDataType eDstBufferDataType = GDT_Unknown;  // data type of pDstBuffer, also returned by pfnResampleFn
    5237             : 
    5238             :             // Synchronization between the code running the job and the code waiting for its completion
    5239             :             bool bFinished = false;  // set to true by JobResampleFunc while holding mutex
    5240             :             std::mutex mutex{};  // protects bFinished
    5241             :             std::condition_variable cv{};  // notified by JobResampleFunc when bFinished becomes true
    5242             :         };
    5243             : 
    5244             :         // Thread function to resample: runs the job's pfnResampleFn, stores its outputs in the job and signals completion. pData must point to an OvrJob.
    5245        3180 :         const auto JobResampleFunc = [](void *pData)
    5246             :         {
    5247        3180 :             OvrJob *poJob = static_cast<OvrJob *>(pData);
    5248             :             // Resample the source chunk; the result buffer and its data type come back through the two out-parameters.
    5249        6360 :             poJob->eErr = poJob->pfnResampleFn(
    5250             :                 poJob->dfXRatioDstToSrc, poJob->dfYRatioDstToSrc, 0.0, 0.0,
    5251             :                 poJob->eWrkDataType, poJob->pChunk, poJob->pabyChunkNodataMask,
    5252             :                 poJob->nChunkXOff, poJob->nChunkXSize, poJob->nChunkYOff,
    5253             :                 poJob->nChunkYSize, poJob->nDstXOff, poJob->nDstXOff2,
    5254             :                 poJob->nDstYOff, poJob->nDstYOff2, poJob->poOverview,
    5255             :                 &(poJob->pDstBuffer), &(poJob->eDstBufferDataType),
    5256        3180 :                 poJob->pszResampling, poJob->bHasNoData, poJob->dfNoDataValue,
    5257        3180 :                 nullptr, poJob->eSrcDataType, poJob->bPropagateNoData);
    5258             :             // Tie the lifetime of the result buffer to the job, whatever the value of eErr.
    5259        3180 :             poJob->oDstBufferHolder.reset(new PointerHolder(poJob->pDstBuffer));
    5260             :             // Publish completion under the mutex so that a waiter checking bFinished under the same mutex cannot miss the notification.
    5261             :             {
    5262        6360 :                 std::lock_guard<std::mutex> guard(poJob->mutex);
    5263        3180 :                 poJob->bFinished = true;
    5264        3180 :                 poJob->cv.notify_one();
    5265             :             }
    5266        3180 :         };
    5267             : 
    5268             :         // Function to write resampled data to target band: writes poJob->pDstBuffer into poJob->poOverview and returns the CPLErr of RasterIO().
    5269        3180 :         const auto WriteJobData = [](const OvrJob *poJob)
    5270             :         {
    5271        6360 :             return poJob->poOverview->RasterIO(
    5272        3180 :                 GF_Write, poJob->nDstXOff, poJob->nDstYOff,
    5273        3180 :                 poJob->nDstXOff2 - poJob->nDstXOff,  // width of the written window
    5274        3180 :                 poJob->nDstYOff2 - poJob->nDstYOff, poJob->pDstBuffer,  // height of the written window, then the data
    5275        3180 :                 poJob->nDstXOff2 - poJob->nDstXOff,  // buffer width: same as the window, so no rescaling is requested
    5276        3180 :                 poJob->nDstYOff2 - poJob->nDstYOff, poJob->eDstBufferDataType,  // buffer height, then buffer data type
    5277        3180 :                 0, 0, nullptr);  // default pixel/line spacing and no extra arguments
    5278             :         };
    5279             : 
    5280             :         // Wait for completion of oldest job (front of jobList), write its result to the overview band and remove it from the list. jobList must not be empty.
    5281             :         const auto WaitAndFinalizeOldestJob =
    5282          16 :             [WriteJobData](std::list<std::unique_ptr<OvrJob>> &jobList)
    5283             :         {
    5284          16 :             auto poOldestJob = jobList.front().get();  // raw pointer; the job stays owned by jobList until pop_front() below
    5285             :             {
    5286          32 :                 std::unique_lock<std::mutex> oGuard(poOldestJob->mutex);  // unique_lock because cv.wait() must be able to release and re-acquire it
    5287             :                 // coverity[missing_lock:FALSE]
    5288          18 :                 while (!poOldestJob->bFinished)  // loop so that spurious wake-ups are harmless
    5289             :                 {
    5290           2 :                     poOldestJob->cv.wait(oGuard);
    5291             :                 }
    5292             :             }
    5293          16 :             CPLErr l_eErr = poOldestJob->eErr;  // result of the resampling step
    5294          16 :             if (l_eErr == CE_None)  // only write the destination buffer when resampling succeeded
    5295             :             {
    5296          16 :                 l_eErr = WriteJobData(poOldestJob);
    5297             :             }
    5298             :             // Removing the list entry destroys the job and, with it, its buffer holders.
    5299          16 :             jobList.pop_front();
    5300          16 :             return l_eErr;  // error of the resampling step, else the status of the write
    5301             :         };
    5302             : 
    5303             :         // Queue of jobs
    5304        1114 :         std::list<std::unique_ptr<OvrJob>> jobList;
    5305             : 
    5306        1114 :         std::vector<void *> apaChunk(nBands);
    5307        1114 :         std::vector<GByte *> apabyChunkNoDataMask(nBands);
    5308             : 
    5309             :         // Iterate on destination overview, block by block.
    5310         557 :         for (int nDstYOff = nDstYOffStart;
    5311        1976 :              nDstYOff < nDstYOffEnd && eErr == CE_None;
    5312        1419 :              nDstYOff += nDstChunkYSize)
    5313             :         {
    5314             :             int nDstYCount;
    5315        1419 :             if (nDstYOff + nDstChunkYSize <= nDstYOffEnd)
    5316        1052 :                 nDstYCount = nDstChunkYSize;
    5317             :             else
    5318         367 :                 nDstYCount = nDstYOffEnd - nDstYOff;
    5319             : 
    5320        1419 :             int nChunkYOff = static_cast<int>(nDstYOff * dfYRatioDstToSrc);
    5321        1419 :             int nChunkYOff2 = static_cast<int>(
    5322        1419 :                 ceil((nDstYOff + nDstYCount) * dfYRatioDstToSrc));
    5323        1419 :             if (nChunkYOff2 > nSrcHeight ||
    5324        1419 :                 nDstYOff + nDstYCount == nDstTotalHeight)
    5325         554 :                 nChunkYOff2 = nSrcHeight;
    5326        1419 :             int nYCount = nChunkYOff2 - nChunkYOff;
    5327        1419 :             CPLAssert(nYCount <= nFullResYChunk);
    5328             : 
    5329        1419 :             int nChunkYOffQueried = nChunkYOff - nKernelRadius * nOvrFactor;
    5330        1419 :             int nChunkYSizeQueried = nYCount + 2 * nKernelRadius * nOvrFactor;
    5331        1419 :             if (nChunkYOffQueried < 0)
    5332             :             {
    5333         120 :                 nChunkYSizeQueried += nChunkYOffQueried;
    5334         120 :                 nChunkYOffQueried = 0;
    5335             :             }
    5336        1419 :             if (nChunkYSizeQueried + nChunkYOffQueried > nSrcHeight)
    5337         119 :                 nChunkYSizeQueried = nSrcHeight - nChunkYOffQueried;
    5338        1419 :             CPLAssert(nChunkYSizeQueried <= nFullResYChunkQueried);
    5339             : 
    5340        1419 :             if (!pfnProgress(dfCurPixelCount / dfTotalPixelCount, nullptr,
    5341             :                              pProgressData))
    5342             :             {
    5343           1 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    5344           1 :                 eErr = CE_Failure;
    5345             :             }
    5346             : 
    5347             :             // Iterate horizontally over the destination chunks of the current row of chunks.
    5348        1419 :             for (int nDstXOff = nDstXOffStart;
    5349        2883 :                  nDstXOff < nDstXOffEnd && eErr == CE_None;
    5350        1464 :                  nDstXOff += nDstChunkXSize)
    5351             :             {
    5352        1464 :                 int nDstXCount = 0;
    5353        1464 :                 if (nDstXOff + nDstChunkXSize <= nDstXOffEnd)
    5354        1447 :                     nDstXCount = nDstChunkXSize;
    5355             :                 else
    5356          17 :                     nDstXCount = nDstXOffEnd - nDstXOff;
    5357             : 
    5358        1464 :                 dfCurPixelCount += static_cast<double>(nDstXCount) * nDstYCount;
    5359             : 
    5360        1464 :                 int nChunkXOff = static_cast<int>(nDstXOff * dfXRatioDstToSrc);
    5361        1464 :                 int nChunkXOff2 = static_cast<int>(
    5362        1464 :                     ceil((nDstXOff + nDstXCount) * dfXRatioDstToSrc));
    5363        1464 :                 if (nChunkXOff2 > nSrcWidth ||
    5364        1464 :                     nDstXOff + nDstXCount == nDstTotalWidth)
    5365        1417 :                     nChunkXOff2 = nSrcWidth;
    5366        1464 :                 const int nXCount = nChunkXOff2 - nChunkXOff;
    5367        1464 :                 CPLAssert(nXCount <= nFullResXChunk);
    5368             : 
    5369        1464 :                 int nChunkXOffQueried = nChunkXOff - nKernelRadius * nOvrFactor;
    5370        1464 :                 int nChunkXSizeQueried =
    5371        1464 :                     nXCount + 2 * nKernelRadius * nOvrFactor;
    5372        1464 :                 if (nChunkXOffQueried < 0)
    5373             :                 {
    5374         172 :                     nChunkXSizeQueried += nChunkXOffQueried;
    5375         172 :                     nChunkXOffQueried = 0;
    5376             :                 }
    5377        1464 :                 if (nChunkXSizeQueried + nChunkXOffQueried > nSrcWidth)
    5378         175 :                     nChunkXSizeQueried = nSrcWidth - nChunkXOffQueried;
    5379        1464 :                 CPLAssert(nChunkXSizeQueried <= nFullResXChunkQueried);
    5380             : #if DEBUG_VERBOSE
    5381             :                 CPLDebug("GDAL",
    5382             :                          "Reading (%dx%d -> %dx%d) for output (%dx%d -> %dx%d)",
    5383             :                          nChunkXOffQueried, nChunkYOffQueried,
    5384             :                          nChunkXSizeQueried, nChunkYSizeQueried, nDstXOff,
    5385             :                          nDstYOff, nDstXCount, nDstYCount);
    5386             : #endif
    5387             : 
    5388             :                 // Avoid accumulating too many tasks and exhausting RAM
    5389             : 
    5390             :                 // Try to complete already finished jobs
    5391        1464 :                 while (eErr == CE_None && !jobList.empty())
    5392             :                 {
    5393           2 :                     auto poOldestJob = jobList.front().get();
    5394             :                     {
    5395           2 :                         std::lock_guard<std::mutex> oGuard(poOldestJob->mutex);
    5396           2 :                         if (!poOldestJob->bFinished)
    5397             :                         {
    5398           2 :                             break;
    5399             :                         }
    5400             :                     }
    5401           0 :                     eErr = poOldestJob->eErr;
    5402           0 :                     if (eErr == CE_None)
    5403             :                     {
    5404           0 :                         eErr = WriteJobData(poOldestJob);
    5405             :                     }
    5406             : 
    5407           0 :                     jobList.pop_front();
    5408             :                 }
    5409             : 
    5410             :                 // And in case we have saturated the number of threads,
    5411             :                 // wait for completion of tasks to go below the threshold.
    5412        2928 :                 while (eErr == CE_None &&
    5413        1464 :                        jobList.size() >= static_cast<size_t>(nThreads))
    5414             :                 {
    5415           0 :                     eErr = WaitAndFinalizeOldestJob(jobList);
    5416             :                 }
    5417             : 
    5418             :                 // (Re)allocate buffers if needed
    5419        4645 :                 for (int iBand = 0; iBand < nBands; ++iBand)
    5420             :                 {
    5421        3181 :                     if (apaChunk[iBand] == nullptr)
    5422             :                     {
    5423        1102 :                         apaChunk[iBand] = VSI_MALLOC3_VERBOSE(
    5424             :                             nFullResXChunkQueried, nFullResYChunkQueried,
    5425             :                             GDALGetDataTypeSizeBytes(eWrkDataType));
    5426        1102 :                         if (apaChunk[iBand] == nullptr)
    5427             :                         {
    5428           0 :                             eErr = CE_Failure;
    5429             :                         }
    5430             :                     }
    5431        3467 :                     if (bUseNoDataMask &&
    5432         286 :                         apabyChunkNoDataMask[iBand] == nullptr)
    5433             :                     {
    5434         486 :                         apabyChunkNoDataMask[iBand] =
    5435         243 :                             static_cast<GByte *>(VSI_MALLOC2_VERBOSE(
    5436             :                                 nFullResXChunkQueried, nFullResYChunkQueried));
    5437         243 :                         if (apabyChunkNoDataMask[iBand] == nullptr)
    5438             :                         {
    5439           0 :                             eErr = CE_Failure;
    5440             :                         }
    5441             :                     }
    5442             :                 }
    5443             : 
    5444             :                 // Read the source buffers for all the bands.
    5445        4645 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    5446             :                 {
    5447        3181 :                     GDALRasterBand *poSrcBand = nullptr;
    5448        3181 :                     if (iSrcOverview == -1)
    5449        2291 :                         poSrcBand = papoSrcBands[iBand];
    5450             :                     else
    5451         890 :                         poSrcBand = papapoOverviewBands[iBand][iSrcOverview];
    5452        3181 :                     eErr = poSrcBand->RasterIO(
    5453             :                         GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    5454        3181 :                         nChunkXSizeQueried, nChunkYSizeQueried, apaChunk[iBand],
    5455             :                         nChunkXSizeQueried, nChunkYSizeQueried, eWrkDataType, 0,
    5456             :                         0, nullptr);
    5457             : 
    5458        3181 :                     if (bUseNoDataMask && eErr == CE_None)
    5459             :                     {
    5460         286 :                         auto poMaskBand = poSrcBand->IsMaskBand()
    5461         286 :                                               ? poSrcBand
    5462         221 :                                               : poSrcBand->GetMaskBand();
    5463         286 :                         eErr = poMaskBand->RasterIO(
    5464             :                             GF_Read, nChunkXOffQueried, nChunkYOffQueried,
    5465             :                             nChunkXSizeQueried, nChunkYSizeQueried,
    5466         286 :                             apabyChunkNoDataMask[iBand], nChunkXSizeQueried,
    5467             :                             nChunkYSizeQueried, GDT_Byte, 0, 0, nullptr);
    5468             :                     }
    5469             :                 }
    5470             : 
    5471             :                 // Compute the resulting overview block.
    5472        4644 :                 for (int iBand = 0; iBand < nBands && eErr == CE_None; ++iBand)
    5473             :                 {
    5474        6360 :                     auto poJob = std::unique_ptr<OvrJob>(new OvrJob());
    5475        3180 :                     poJob->pfnResampleFn = pfnResampleFn;
    5476        3180 :                     poJob->dfXRatioDstToSrc = dfXRatioDstToSrc;
    5477        3180 :                     poJob->dfYRatioDstToSrc = dfYRatioDstToSrc;
    5478        3180 :                     poJob->eWrkDataType = eWrkDataType;
    5479        3180 :                     poJob->pChunk = apaChunk[iBand];
    5480        3180 :                     poJob->pabyChunkNodataMask = apabyChunkNoDataMask[iBand];
    5481        3180 :                     poJob->nChunkXOff = nChunkXOffQueried;
    5482        3180 :                     poJob->nChunkXSize = nChunkXSizeQueried;
    5483        3180 :                     poJob->nChunkYOff = nChunkYOffQueried;
    5484        3180 :                     poJob->nChunkYSize = nChunkYSizeQueried;
    5485        3180 :                     poJob->nDstXOff = nDstXOff;
    5486        3180 :                     poJob->nDstXOff2 = nDstXOff + nDstXCount;
    5487        3180 :                     poJob->nDstYOff = nDstYOff;
    5488        3180 :                     poJob->nDstYOff2 = nDstYOff + nDstYCount;
    5489        3180 :                     poJob->poOverview = papapoOverviewBands[iBand][iOverview];
    5490        3180 :                     poJob->pszResampling = pszResampling;
    5491        3180 :                     poJob->bHasNoData = pabHasNoData[iBand];
    5492        3180 :                     poJob->dfNoDataValue = padfNoDataValue[iBand];
    5493        3180 :                     poJob->eSrcDataType = eDataType;
    5494        3180 :                     poJob->bPropagateNoData = bPropagateNoData;
    5495             : 
    5496        3180 :                     if (poJobQueue)
    5497             :                     {
    5498          32 :                         poJob->oSrcMaskBufferHolder.reset(
    5499          16 :                             new PointerHolder(apabyChunkNoDataMask[iBand]));
    5500          16 :                         apabyChunkNoDataMask[iBand] = nullptr;
    5501             : 
    5502          32 :                         poJob->oSrcBufferHolder.reset(
    5503          16 :                             new PointerHolder(apaChunk[iBand]));
    5504          16 :                         apaChunk[iBand] = nullptr;
    5505             : 
    5506          16 :                         poJobQueue->SubmitJob(JobResampleFunc, poJob.get());
    5507          16 :                         jobList.emplace_back(std::move(poJob));
    5508             :                     }
    5509             :                     else
    5510             :                     {
    5511        3164 :                         JobResampleFunc(poJob.get());
    5512        3164 :                         eErr = poJob->eErr;
    5513        3164 :                         if (eErr == CE_None)
    5514             :                         {
    5515        3164 :                             eErr = WriteJobData(poJob.get());
    5516             :                         }
    5517             :                     }
    5518             :                 }
    5519             :             }
    5520             :         }
    5521             : 
    5522             :         // Wait for all pending jobs to complete
    5523         573 :         while (!jobList.empty())
    5524             :         {
    5525          16 :             const auto l_eErr = WaitAndFinalizeOldestJob(jobList);
    5526          16 :             if (l_eErr != CE_None && eErr == CE_None)
    5527           0 :                 eErr = l_eErr;
    5528             :         }
    5529             : 
    5530             :         // Flush the data to overviews.
    5531        1657 :         for (int iBand = 0; iBand < nBands; ++iBand)
    5532             :         {
    5533        1100 :             CPLFree(apaChunk[iBand]);
    5534        1100 :             papapoOverviewBands[iBand][iOverview]->FlushCache(false);
    5535             : 
    5536        1100 :             CPLFree(apabyChunkNoDataMask[iBand]);
    5537             :         }
    5538             :     }
    5539             : 
    5540         326 :     CPLFree(pabHasNoData);
    5541         326 :     CPLFree(padfNoDataValue);
    5542             : 
    5543         326 :     if (eErr == CE_None)
    5544         324 :         pfnProgress(1.0, nullptr, pProgressData);
    5545             : 
    5546         326 :     return eErr;
    5547             : }
    5548             : 
    5549             : /************************************************************************/
    5550             : /*                        GDALComputeBandStats()                        */
    5551             : /************************************************************************/
    5552             : 
    5553             : /** Compute the mean and standard deviation of the pixel values of a band.
    5554             :  * @param hSrcBand Band to sample; lines are read as Float32, or as CFloat32 (magnitude used) for complex types.
    5555             :  * @param nSampleStep Step between scanlines used to compute statistics.
    5556             :  *                    When nSampleStep is equal to 1, all scanlines will
    5557             :  *                    be processed. Values < 1 or >= the band height are reset to 1.
    5558             :  * @param pdfMean Receives the mean of the sampled values; may be nullptr.
    5559             :  * @param pdfStdDev Receives sqrt(mean of squares - mean squared); may be nullptr.
    5560             :  * @param pfnProgress Progress callback; nullptr selects GDALDummyProgress. A zero return aborts with CE_Failure.
    5561             :  * @param pProgressData Opaque pointer forwarded to pfnProgress.
    5562             :  * @return CE_None on success, the RasterIO() error code if a read fails, CE_Failure otherwise (zero width, failed allocation, cancelled progress).
    5563             :  */
    5564          16 : CPLErr CPL_STDCALL GDALComputeBandStats(GDALRasterBandH hSrcBand,
    5565             :                                         int nSampleStep, double *pdfMean,
    5566             :                                         double *pdfStdDev,
    5567             :                                         GDALProgressFunc pfnProgress,
    5568             :                                         void *pProgressData)
    5569             : 
    5570             : {
    5571          16 :     VALIDATE_POINTER1(hSrcBand, "GDALComputeBandStats", CE_Failure);
    5572             : 
    5573          16 :     GDALRasterBand *poSrcBand = GDALRasterBand::FromHandle(hSrcBand);
    5574             : 
    5575          16 :     if (pfnProgress == nullptr)
    5576          16 :         pfnProgress = GDALDummyProgress;
    5577             : 
    5578          16 :     const int nWidth = poSrcBand->GetXSize();
    5579          16 :     const int nHeight = poSrcBand->GetYSize();
    5580             : 
    5581          16 :     if (nSampleStep >= nHeight || nSampleStep < 1)
    5582           3 :         nSampleStep = 1;
    5583             : 
    5584          16 :     GDALDataType eWrkType = GDT_Unknown;
    5585          16 :     float *pafData = nullptr;
    5586          16 :     GDALDataType eType = poSrcBand->GetRasterDataType();
    5587          16 :     const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    5588          16 :     if (bComplex)
    5589             :     {
    5590             :         pafData = static_cast<float *>(
    5591           0 :             VSI_MALLOC_VERBOSE(nWidth * 2 * sizeof(float)));  // NOTE(review): nWidth * 2 is evaluated in int; GDALOverviewMagnitudeCorrection uses VSI_MALLOC2_VERBOSE for the same buffer - confirm
    5592           0 :         eWrkType = GDT_CFloat32;
    5593             :     }
    5594             :     else
    5595             :     {
    5596             :         pafData =
    5597          16 :             static_cast<float *>(VSI_MALLOC_VERBOSE(nWidth * sizeof(float)));
    5598          16 :         eWrkType = GDT_Float32;
    5599             :     }
    5600             : 
    5601          16 :     if (nWidth == 0 || pafData == nullptr)  // empty band or failed allocation
    5602             :     {
    5603           0 :         VSIFree(pafData);
    5604           0 :         return CE_Failure;
    5605             :     }
    5606             : 
    5607             :     /* -------------------------------------------------------------------- */
    5608             :     /*      Accumulate sum and sum of squares over the sampled lines.       */
    5609             :     /* -------------------------------------------------------------------- */
    5610          16 :     double dfSum = 0.0;
    5611          16 :     double dfSum2 = 0.0;
    5612          16 :     int iLine = 0;
    5613          16 :     GIntBig nSamples = 0;
    5614             : 
    5615        2143 :     do
    5616             :     {
    5617        2159 :         if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    5618             :                          pProgressData))
    5619             :         {
    5620           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    5621           0 :             CPLFree(pafData);
    5622           0 :             return CE_Failure;
    5623             :         }
    5624             : 
    5625             :         const CPLErr eErr =
    5626        2159 :             poSrcBand->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData, nWidth,
    5627             :                                 1, eWrkType, 0, 0, nullptr);
    5628        2159 :         if (eErr != CE_None)
    5629             :         {
    5630           1 :             CPLFree(pafData);
    5631           1 :             return eErr;
    5632             :         }
    5633             : 
    5634      725204 :         for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    5635             :         {
    5636      723046 :             float fValue = 0.0f;
    5637             : 
    5638      723046 :             if (bComplex)
    5639             :             {
    5640             :                 // Complex pixels are stored as (real, imaginary) float pairs; use the magnitude.
    5641             :                 fValue =
    5642           0 :                     std::hypot(pafData[iPixel * 2], pafData[iPixel * 2 + 1]);
    5643             :             }
    5644             :             else
    5645             :             {
    5646      723046 :                 fValue = pafData[iPixel];
    5647             :             }
    5648             : 
    5649      723046 :             dfSum += fValue;
    5650      723046 :             dfSum2 += static_cast<double>(fValue) * fValue;  // square computed in double
    5651             :         }
    5652             : 
    5653        2158 :         nSamples += nWidth;
    5654        2158 :         iLine += nSampleStep;
    5655        2158 :     } while (iLine < nHeight);
    5656             : 
    5657          15 :     if (!pfnProgress(1.0, nullptr, pProgressData))
    5658             :     {
    5659           0 :         CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    5660           0 :         CPLFree(pafData);
    5661           0 :         return CE_Failure;
    5662             :     }
    5663             : 
    5664             :     /* -------------------------------------------------------------------- */
    5665             :     /*      Produce the result values (population mean and std. dev.).      */
    5666             :     /* -------------------------------------------------------------------- */
    5667          15 :     if (pdfMean != nullptr)
    5668          15 :         *pdfMean = dfSum / nSamples;
    5669             : 
    5670          15 :     if (pdfStdDev != nullptr)
    5671             :     {
    5672          15 :         const double dfMean = dfSum / nSamples;
    5673             : 
    5674          15 :         *pdfStdDev = sqrt((dfSum2 / nSamples) - (dfMean * dfMean));
    5675             :     }
    5676             : 
    5677          15 :     CPLFree(pafData);
    5678             : 
    5679          15 :     return CE_None;
    5680             : }
    5681             : 
    5682             : /************************************************************************/
    5683             : /*                  GDALOverviewMagnitudeCorrection()                   */
    5684             : /*                                                                      */
    5685             : /*      Correct the mean and standard deviation of the overviews of     */
    5686             : /*      the given band to match the base layer approximately.           */
    5687             : /************************************************************************/
    5688             : 
    5689             : /** Rescale overview pixel values so their mean and standard deviation approximate those of the base band.
    5690             :  * @param hBaseBand Reference band; its statistics are computed from every second scanline.
    5691             :  * @param nOverviewCount Number of entries in pahOverviews.
    5692             :  * @param pahOverviews Overview bands; each is read and rewritten one scanline at a time.
    5693             :  * @param pfnProgress Progress callback; it is invoked directly in this function with no nullptr fallback. A zero return aborts with CE_Failure.
    5694             :  * @param pProgressData Opaque pointer forwarded to pfnProgress.
    5695             :  * @return CE_None on success, the GDALComputeBandStats() error if statistics fail, CE_Failure otherwise.
    5696             :  */
    5697           0 : CPLErr GDALOverviewMagnitudeCorrection(GDALRasterBandH hBaseBand,
    5698             :                                        int nOverviewCount,
    5699             :                                        GDALRasterBandH *pahOverviews,
    5700             :                                        GDALProgressFunc pfnProgress,
    5701             :                                        void *pProgressData)
    5702             : 
    5703             : {
    5704           0 :     VALIDATE_POINTER1(hBaseBand, "GDALOverviewMagnitudeCorrection", CE_Failure);
    5705             : 
    5706             :     /* -------------------------------------------------------------------- */
    5707             :     /*      Compute mean/stddev for source raster (sample step of 2).       */
    5708             :     /* -------------------------------------------------------------------- */
    5709           0 :     double dfOrigMean = 0.0;
    5710           0 :     double dfOrigStdDev = 0.0;
    5711             :     {
    5712             :         const CPLErr eErr =
    5713           0 :             GDALComputeBandStats(hBaseBand, 2, &dfOrigMean, &dfOrigStdDev,
    5714             :                                  pfnProgress, pProgressData);
    5715             : 
    5716           0 :         if (eErr != CE_None)
    5717           0 :             return eErr;
    5718             :     }
    5719             : 
    5720             :     /* -------------------------------------------------------------------- */
    5721             :     /*      Loop on overview bands.                                         */
    5722             :     /* -------------------------------------------------------------------- */
    5723           0 :     for (int iOverview = 0; iOverview < nOverviewCount; ++iOverview)
    5724             :     {
    5725             :         GDALRasterBand *poOverview =
    5726           0 :             GDALRasterBand::FromHandle(pahOverviews[iOverview]);
    5727             :         double dfOverviewMean, dfOverviewStdDev;
    5728             : 
    5729             :         const CPLErr eErr =
    5730           0 :             GDALComputeBandStats(pahOverviews[iOverview], 1, &dfOverviewMean,
    5731             :                                  &dfOverviewStdDev, pfnProgress, pProgressData);
    5732             : 
    5733           0 :         if (eErr != CE_None)
    5734           0 :             return eErr;
    5735             : 
    5736           0 :         double dfGain = 1.0;
    5737           0 :         if (dfOrigStdDev >= 0.0001)
    5738           0 :             dfGain = dfOrigStdDev / dfOverviewStdDev;  // NOTE(review): guard tests dfOrigStdDev, divisor dfOverviewStdDev is unchecked - confirm
    5739             : 
    5740             :         /* --------------------------------------------------------------------
    5741             :          */
    5742             :         /*      Apply gain and offset, one scanline at a time. */
    5743             :         /* --------------------------------------------------------------------
    5744             :          */
    5745           0 :         const int nWidth = poOverview->GetXSize();
    5746           0 :         const int nHeight = poOverview->GetYSize();
    5747             : 
    5748           0 :         GDALDataType eWrkType = GDT_Unknown;
    5749           0 :         float *pafData = nullptr;
    5750           0 :         const GDALDataType eType = poOverview->GetRasterDataType();
    5751           0 :         const bool bComplex = CPL_TO_BOOL(GDALDataTypeIsComplex(eType));
    5752           0 :         if (bComplex)
    5753             :         {
    5754             :             pafData = static_cast<float *>(
    5755           0 :                 VSI_MALLOC2_VERBOSE(nWidth, 2 * sizeof(float)));
    5756           0 :             eWrkType = GDT_CFloat32;
    5757             :         }
    5758             :         else
    5759             :         {
    5760             :             pafData = static_cast<float *>(
    5761           0 :                 VSI_MALLOC2_VERBOSE(nWidth, sizeof(float)));
    5762           0 :             eWrkType = GDT_Float32;
    5763             :         }
    5764             : 
    5765           0 :         if (pafData == nullptr)
    5766             :         {
    5767           0 :             return CE_Failure;
    5768             :         }
    5769             : 
    5770           0 :         for (int iLine = 0; iLine < nHeight; ++iLine)
    5771             :         {
    5772           0 :             if (!pfnProgress(iLine / static_cast<double>(nHeight), nullptr,
    5773             :                              pProgressData))
    5774             :             {
    5775           0 :                 CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    5776           0 :                 CPLFree(pafData);
    5777           0 :                 return CE_Failure;
    5778             :             }
    5779             : 
    5780           0 :             if (poOverview->RasterIO(GF_Read, 0, iLine, nWidth, 1, pafData,
    5781             :                                      nWidth, 1, eWrkType, 0, 0,
    5782           0 :                                      nullptr) != CE_None)
    5783             :             {
    5784           0 :                 CPLFree(pafData);
    5785           0 :                 return CE_Failure;
    5786             :             }
    5787             : 
    5788           0 :             for (int iPixel = 0; iPixel < nWidth; ++iPixel)
    5789             :             {
    5790           0 :                 if (bComplex)
    5791             :                 {
    5792           0 :                     pafData[iPixel * 2] *= static_cast<float>(dfGain);  // complex: gain only, no mean shift
    5793           0 :                     pafData[iPixel * 2 + 1] *= static_cast<float>(dfGain);
    5794             :                 }
    5795             :                 else
    5796             :                 {
    5797           0 :                     pafData[iPixel] = static_cast<float>(
    5798           0 :                         (pafData[iPixel] - dfOverviewMean) * dfGain +
    5799             :                         dfOrigMean);
    5800             :                 }
    5801             :             }
    5802             : 
    5803           0 :             if (poOverview->RasterIO(GF_Write, 0, iLine, nWidth, 1, pafData,
    5804             :                                      nWidth, 1, eWrkType, 0, 0,
    5805           0 :                                      nullptr) != CE_None)
    5806             :             {
    5807           0 :                 CPLFree(pafData);
    5808           0 :                 return CE_Failure;
    5809             :             }
    5810             :         }
    5811             : 
    5812           0 :         if (!pfnProgress(1.0, nullptr, pProgressData))
    5813             :         {
    5814           0 :             CPLError(CE_Failure, CPLE_UserInterrupt, "User terminated");
    5815           0 :             CPLFree(pafData);
    5816           0 :             return CE_Failure;
    5817             :         }
    5818             : 
    5819           0 :         CPLFree(pafData);
    5820             :     }
    5821             : 
    5822           0 :     return CE_None;
    5823             : }

Generated by: LCOV version 1.14